From 29bf4d24f9e7dd5aff2d2b81799cdf6399569c76 Mon Sep 17 00:00:00 2001
From: Andy Lugo
Date: Tue, 14 Jan 2025 18:19:48 +0000
Subject: [PATCH 01/46] update CK

---
 third_party/composable_kernel | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index 50ee4267e27b..c0b90f130f4c 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit 50ee4267e27b875d149e642f4cebd47be1dc3b57
+Subproject commit c0b90f130f4cad7f1e7fc97c4d58d4798ecc2d47

From 412aeb5b592171571ddd01c28b4f7725cec00c73 Mon Sep 17 00:00:00 2001
From: Andy Lugo
Date: Tue, 14 Jan 2025 18:54:48 +0000
Subject: [PATCH 02/46] Remove generated files

---
 ...042c36bc588e60a7c8a9ba297a8a25d8ac0660.hip | 138 -
 ...29076f83a3dc695a167beda6fe19230a2b114b.hip | 138 -
 ...6c417a52a1bd7c55e45d111483d26f4480caeb.hip | 138 -
 ...8f2429c678d13386a06e8d8b15c4b480940ff3.hip | 73 -
 ...a2adbe938d458d51ca5fc4020667a215b672a4.hip | 138 -
 ...2c0f480917c329f4c3c6c666cf32af2d82b294.hip | 80 -
 ...4c209d5cfc6b965bfd78c64bf132c0154e32be.hip | 138 -
 ...53ec18d3ded0f8bdc6459ea5757ebd94d9faf2.hip | 73 -
 ...ac1a2ecf9a487809e46faa92e267df2d47de91.hip | 73 -
 ...ca79005067e20e4eed5a72ff9187cde702cd1c.hip | 138 -
 ...cb354dddef6e99e4ac843f2adafcddfc58d520.hip | 73 -
 ...d12033d59ce2799a2a024e5d9232325ccf1320.hip | 138 -
 ...d3b034a2d8d0b83c0aefa4faac6c3f28ce737f.hip | 138 -
 ...e2428c5447aa9a78f79f73f31cf685c586872d.hip | 138 -
 ...e8aedb7b7d77f44a46b2e9b7a826f245aaf4a7.hip | 80 -
 ...e8f0df0c54ce619e5b66441b3c96a5e18b05d6.hip | 138 -
 ...ee0083f6df962c4a754cd3295b1a436c590a0e.hip | 138 -
 ...f74764c3c3284fdd1b67d0ea781c2261ed0de6.hip | 138 -
 ...25857454eaab2eb664aef7a0849ce12c32fdf9.hip | 138 -
 ...37c76137df14fb808ade8bd6837045f2aaa5c9.hip | 138 -
 ...71bd8b7c270e1593871b638288a4923342c446.hip | 138 -
 ...d88a03cd3966dd0cff550065f58c3ffecfff6c.hip | 138 -
 ...ff94e3c787a7b06ffc90c25777fa74f225e32c.hip | 138 -
 ...0a759dcc92028b4c6f317fc230b98cb929e806.hip | 138 -
 ...1b12f9fd94e01aaff2c0da4f35f346822087e4.hip | 138 -
 ...6887daf6cc092e7422a17882488e59cecfb643.hip | 138 -
 ...7c6c80fcec3eb8b0bef50ad6af6d27bf5447f5.hip | 138 -
 ...92491c5a6dfc742c2be483419a40f6a7a7ea56.hip | 138 -
 ...a71615a088e972c998f9c7cb44566c268c5124.hip | 138 -
 ...ff035717140f7385282419598cb4fb2881ce8e.hip | 138 -
 ...1a0718891596ddac1fb0088637029233ccbe60.hip | 138 -
 ...2a156e9eb935555ab14a84461959b466c2fb5b.hip | 80 -
 ...641230fe9a50a221047f7a1df8a370f72805b9.hip | 138 -
 ...c363e11d202c6d2f4bb753661c5a2043edc0ad.hip | 138 -
 ...caeecbc01667ec6f5599358a0a20423aa9a00b.hip | 138 -
 ...f39b453505f68a5091f68b1c3de48369d1e7ea.hip | 138 -
 ...ffca078cfab8bc6c4ccd1cc8994a1bb4a88ea7.hip | 138 -
 ...02e718337eab7d47aa65cea7d3c5f641484520.hip | 138 -
 ...13b2f3bd8ad51315aadb7f63737201898adca8.hip | 138 -
 ...3981d9e7af2ebc0f91e61ac5e25cbe68c95bd8.hip | 80 -
 ...4fda16133a0d25077967b05425f9128e1fe1a5.hip | 138 -
 ...538339c21c92c53d237865d72debaaf2ee5075.hip | 138 -
 ...95316f0dfffda03e5296b959a49ec3f3c48d67.hip | 138 -
 ...dfe927fd64a564c5fad537fb7c41ee9c94c2c0.hip | 138 -
 ...e60b3ab7477f9edc8576a8bf43e3a62b8d5ef8.hip | 138 -
 ...f794c7023cbb7e35f1fd1ae45bd2377bfbc520.hip | 138 -
 ...28931bf5cc1daa6e106cf60bb21fa1aac6b1df.hip | 138 -
 ...2c8c3c1cf6c33af4574099e9b6ac54a55ad776.hip | 138 -
 ...82150e93f547e00f13cd8984779bf49b91e50c.hip | 80 -
 ...9c663be0267c009be4814e9e4e7c13ec999411.hip | 138 -
 ...ae52ef937cc27c544e32025ea0dadb7fad982d.hip | 138 -
 ...b74acd9abfbd1c4ec2f4c718eeb92a0bca7bab.hip | 138 -
...ba94794a14f0f0022af6f5f3c16e1e16959d4c.hip | 138 - ...1751b1012b90f7b57f8591cd06ae1fd27d9cd3.hip | 80 - ...66e7aa4b263a811408b285213e47176ee2bdaf.hip | 80 - ...6b3beb57b30afb30636f948e3989b346b38d20.hip | 138 - ...89852b0cd3cc030c78b28f2fd5b6b0546382a4.hip | 80 - ...8b96ad691a85eebd18586db0b62b8911016d9c.hip | 138 - ...c3fc96d2bebe546dce6ebf46e5c7a519959599.hip | 80 - ...ff04fcc273e469737512893ea3fb5876ac131d.hip | 138 - ...01c56831b4c6428200db6318638a2129bb197a.hip | 73 - ...36d5dfc0f939ab9a4064b403339373caf35b56.hip | 138 - ...42c4e3aabdf55405b3ce09ce1899245ddf11ad.hip | 138 - ...5722b43cde5f37242edb071f639da7c4a0bd48.hip | 138 - ...78b9aa31429d23a93cd953cc6a2fc5f43d0d3a.hip | 138 - ...9a347aef8a920e3b59d5ffe71fc5bfe002609c.hip | 138 - ...9de13222caec1483207d4a54249f8da4f9c151.hip | 138 - ...1cb49c1958fb4342d79f367ea93cf2b472f785.hip | 138 - ...3834d4d3fe76e1745e4482c6b51b550c6f3dfc.hip | 138 - ...513bff5c1da6aadf11d2e8272a422eabff21bc.hip | 138 - ...6863cd93d1b105a617d0daa1d4f37d7fb6b893.hip | 138 - ...68cebd81ade762c2f92fffc0153fa7a2b91eb5.hip | 138 - ...6e888c52d0f4a5847d7515fcc66208b1ff40d3.hip | 138 - ...7b3e1dae9bfb2e89398706508f8e01966fd4ea.hip | 138 - ...d76cca48b71dbcc9bd96734787209fee4c9a74.hip | 138 - ...e50367b62bb09071e28b44235a7c112645a706.hip | 138 - ...ecb6347009f6a5d5530a6acf90f9f40288cbcf.hip | 80 - ...2b116fd5065109aae46ee547e4f49ad0e9d6e1.hip | 80 - ...4e76d89b175e1d9fd2e9fb908d5fce1ebb945d.hip | 80 - ...55ed15ef58c941e06dda890aeb530e28eb7bba.hip | 138 - ...672fca51de618e3441cf8764e8e83eb782f2c7.hip | 138 - ...68c2f9a3acdd787b81be455cbc7836c8bfd90c.hip | 80 - ...89417a043556970f72eebd48b4f3e7ac15377a.hip | 138 - ...92671b6ea99891c0d69b1c793f4d131b9a82ed.hip | 138 - ...afb881e34a3794970a1282af740b3f19c138b1.hip | 80 - ...ce6e29e1d3060c3086c08fe27b471e375f9c75.hip | 80 - ...d9d68fcee021437e13ffdf94d78252205f5a31.hip | 80 - ...2647b5982405a48e8c8888552a4b89386ccdd9.hip | 80 - ...2efefea81036641561bed80c75d77651176f74.hip | 138 - ...3153af7bcdba33115a0d31f121fd76be2ffbcc.hip | 138 - ...532fcf26f90c82a792cde7943634f667c1d033.hip | 138 - ...90a0186d8b8004e3f19886c7992c8e04d0e066.hip | 80 - ...9585ba1c10acf67115c5899b3546608541820d.hip | 138 - ...b81407c8a2b3cdc5fecf655b3ad64d5d729cc9.hip | 80 - ...c7910aac798f0555e9e505ad7f177c9fbbd92c.hip | 65 - ...e8cf70c6be969ecfca675782c860b5b75ac089.hip | 138 - ...efed50a89d80c22b2c8c3d5ba67d73c3d0190e.hip | 80 - ...32a2d9701e23dd930119c4ee8089042b5b0ac5.hip | 138 - ...3b2ec99fa7b09c7f78dcc3142a661d686044ac.hip | 138 - ...8a0bb89a6f05289c0405df5126fa0cc16252e7.hip | 138 - ...93c65e5942a2f43f2e491547add02777dd2eee.hip | 73 - ...9bd38b8f9009d932ec49204fdea39a52885246.hip | 138 - ...aeedaa7d50f1741d618fb6c573529eebb075b1.hip | 138 - ...def49859c80c6b3ba18eb2fb4c35c72abc1cf2.hip | 138 - ...ee6b9427c164d78994150305a47f73954a67c0.hip | 138 - ...0e0147a92061d32608a34e7b47bd534eb787fa.hip | 138 - ...13a4c8d169877da6408584dc1f20a6f7c5e3aa.hip | 138 - ...de401aa76cb5425563cbbdb0362748148da3ca.hip | 80 - ...007c36231ccdae12f102eacca1f74b0711b9c6.hip | 138 - ...0a2370f2a320484d8f9f21e3197425c2dbe9ad.hip | 65 - ...1dbc9c433ce8ec33ace9e62550261d613db582.hip | 65 - ...3f4cd28a4c06cc109f6a0798a77844bcc750b7.hip | 138 - ...661b5f30566d1f159f060c264849c7ae4772f1.hip | 80 - ...bacd06455ab20eba78b389462946716b5819f6.hip | 80 - ...f309b923172f4c0fb38d9b9f5325b33b4877c2.hip | 138 - ...f9b9413697d6f4573c6605bff6f58d027c5016.hip | 138 - ...fdaa9266a5a464009297dc59db92504f8bf1a3.hip | 138 - ...0c699d9c3b0ed62097e38ba05e40e815cf474e.hip | 138 - 
...588dcb2ef86677ebf84e406eb802e9921d1f1e.hip | 138 - ...bb0bef3b388867e75d7a8a187b8b4b650a42ae.hip | 138 - ...bddf533661642d84bf5a16149692d5a892182a.hip | 138 - ...cb7492feb79e27e0bda73e57ef7dab410e2bb6.hip | 138 - ...d4068ea93fcf4df463e3bf3a6898d23b65da7f.hip | 138 - ...3186dbad604763008e0204a1ea90baecef8877.hip | 80 - ...37f1bc50c4a65dac09ba56b701256b701c4322.hip | 138 - ...a055e5c3d6a953d470db5dc21449766248058a.hip | 73 - ...c24f1f9009e46afa3a59193784cc2575f79056.hip | 138 - ...ceed95b0a0a01f844678717c88e0426fb503fd.hip | 138 - ...32b11429034d96d82c82dbfdb69e460ad8a564.hip | 138 - ...e7df31541c3aa919e9825ad7dc4432f9a03c0c.hip | 138 - ...ff174ff2175e9ec22ac3a0fa59dd7713b79643.hip | 138 - ...11733062ed30b876f1d63bffa642d77e258dd6.hip | 80 - ...207f4b6e7fac27d6c16493a5373f448a2aaae8.hip | 80 - ...41814f76107d74ed069ecec99a248676487eee.hip | 138 - ...d5c8a4988efe60ef7943ecd73e18a28a736583.hip | 80 - ...d60c8abecb3bc9b84b0ea7851628ab17d8b0b3.hip | 138 - ...1691f01cc7f29affb88152dd48c7a484315dcd.hip | 138 - ...1c1fdc4206bb952b2fea675f24e3b09f605eef.hip | 73 - ...3c51948cf8584900807998da14d788039f53b9.hip | 138 - ...5ea67de101135ed5fe04f5cab1ec1d7b3714bb.hip | 80 - ...7fa6780d9e6bde10aec10a875c039fdbbc652e.hip | 138 - ...86cd75411e61a8dbbaf2b916e62f4f5f99104f.hip | 138 - ...d5f2ec83b3331654e37ea0b44d88cd98abaa37.hip | 138 - ...f747525ad31e76c88774fb2208e470da9c2310.hip | 138 - ...221590b90c48d3cf259fb4e834ccfaf7f3209b.hip | 80 - ...4f19363ef26efd36f0436cfa9f84f181a8824c.hip | 138 - ...6eb8c40e3146e06936f3141b2c4d92a578ddec.hip | 80 - ...baaaf1e90a075ab802c6e7d97c4b1605c8bd72.hip | 138 - ...c4ebd1792c781d219bd21b691b575f64635730.hip | 138 - ...d11aad7b666f500f68b264a2fcca6dfc5f1a05.hip | 138 - ...d4630876785655bd4950566e81ae0b645c0d3c.hip | 138 - ...f77aeeafe4b28f314fde5ebccfd2a554872781.hip | 138 - ...fea611f3c253aebf726af3e5fdb7e63e18e13a.hip | 138 - ...1a4425b411596c46c7032f6b83d3152a0e0cd4.hip | 138 - ...3e897098539c3466da9d7a37234daf16476277.hip | 138 - ...52dc38d26f6badb7a9bcb5ce9124d54cc45ed3.hip | 138 - ...5bafb551768855c8c01faa63e44764ebe6c110.hip | 80 - ...5c3549d067464d186a99b8205317cc000d4898.hip | 138 - ...73e3d855d28c54af612ab950b081302891d56d.hip | 80 - ...7768cd725813f8111d265cfdfea7f42034e5e9.hip | 80 - ...7b89d8d625b8244b5cceaa4d3e5fc5a09c8989.hip | 138 - ...8d5ce564c3ae1eefb54e3d41dde2604560ef4a.hip | 138 - ...9ee1f1b44d1a8fbaead65d8449413bb616d15e.hip | 80 - ...b255dde1a9d915e582ee2a83de7d83190c6a24.hip | 80 - ...cf7068183421b141ed5d6e7fe902d06b6492a1.hip | 138 - ...dc02ea7e0908cf0bd48034f5a49debfaa36219.hip | 138 - ...e8e1ab8c63db96843054bb7a98d708ae6a9c44.hip | 138 - ...fe3e8f4add16a088fe44458353fa7c0c4f9658.hip | 138 - ...047b5544acef40e39932672cac6f562e200948.hip | 138 - ...21507cf219fe608715d4e5bb6e5764022e2d61.hip | 138 - ...2b0dfbe3f615b1d164290799b2457437a0044b.hip | 138 - ...4a947a6c2ba83a5b1cb7074aee0bdac6c9c64e.hip | 138 - ...5dfb45658df8f1ae8dc0738ac9614740f2576c.hip | 138 - ...7f5328b035ed59a6f05dfee31edd704c4b07ee.hip | 80 - ...87ddf65ce4ed2997583e20fee9f201e86633b3.hip | 138 - ...f94f5c65c37624f5458c165daf83517d9e3c81.hip | 138 - ...3c44dd85077e6b12dd06fdcf6b11ba349e1866.hip | 138 - ...b9b96edda151072215502cc2b606bf1f6f0b03.hip | 138 - ...47fef2c06ea581b0ab31af1cb0556c572696ad.hip | 138 - ...7963e1969301abfa61d06afc97faea2bb4efb1.hip | 65 - ...86d4bf54b3a4a9e093360998b2059b3c03d072.hip | 138 - ...8a70d526394e254274df95de0727850820326c.hip | 80 - ...99e28aff2fb168cdc3af7132dd7fd09c2e1ced.hip | 138 - ...a4d71b31c451a50df7996e3db864bc3c3882ed.hip | 138 - 
...b92b4e249195ac3e0c74d246585a4c9e0992fd.hip | 65 - ...ed7195a9443c84956c3f32839cb3ab9056bdfc.hip | 138 - ...14250fce818584291c69a5f058a58cfbd83df9.hip | 138 - ...3699a5daa14ca2def07489e0b563149bc403f8.hip | 138 - ...af6a7f9e5020e8d0f0ca0f6258001f6ce592c1.hip | 138 - ...cd9f7b08cec83736605af63d9fcaf463a1aea4.hip | 65 - ...df4e13108e043361e9528b71df56f04f696a0c.hip | 138 - ...11dd5ebb989503a1c182684e7f247e2f8cd9c2.hip | 65 - ...236be9da05a07d11cd28034d90cdf89941a172.hip | 138 - ...5e18f6333ed2cce509f07cb8bd5868951d66a0.hip | 138 - ...6785392af35e27d6697b584cb6f17a766d3fee.hip | 138 - ...6bc2762b95d550485aa720edaf71138d94cd07.hip | 138 - ...8da3e6ab050262b659c801ccf9a14787d7f176.hip | 138 - ...96f0ac76f117e66eba97cb990c2350561ec2ab.hip | 138 - ...98bcbe900f8c141136d18c114b02fffbe8bca1.hip | 138 - ...99b2625adffa8215276bb88fc65bae944b846b.hip | 138 - ...cf2f892742b1d236d2b31a8185c6869126adad.hip | 80 - ...3e7c8969027d3316875f33dc50fe022e05ce37.hip | 73 - ...e43f8b629e7039f57b95866d5777273377470d.hip | 138 - ...e746990a2032f0363ad9f9112cc994983f4706.hip | 138 - ...f767e7104cfc8322f26df35907fbf04b8948f3.hip | 138 - ...1b0f85e085dd0769c566fb16aafe5ab5952714.hip | 138 - ...2a2d78176e3f0a78e3ad78217e75a4430c0de5.hip | 138 - ...65ba6dba01da9caa84ba89453b61d81376763f.hip | 80 - ...a3f45d0be2d1119cccd0af042a3e8adeda2ed7.hip | 1965 --- ...bf88db44aa5f884438288a325270d29c7a04b6.hip | 80 - ...c459e57bfed5ec7f40ea4a4dd9f72f3ad7a709.hip | 138 - ...02609fb803ea2697e2c2cef35e6f923d2578cf.hip | 138 - ...0b822743e0205f60521d38d7c64f589fdf0f58.hip | 138 - ...21263e16dafe79b9fe2f998847296e575c14e7.hip | 80 - ...3ef3d5ded0dfe2a0bafb52ea8f841658db35fd.hip | 80 - ...498e418ebbf33bed58b4074d1edf3d9bdd07c5.hip | 138 - ...a23de9604b5d98fe02529075bad995954c12ca.hip | 138 - ...b03461737f1e359f389a8d297476f9b60faabd.hip | 138 - ...c6e599144a093203fd7f92ac6d3c2cd7180d49.hip | 138 - ...e2f97d49f015b9af0b186801e939c6f357a0c4.hip | 138 - ...f893ee660d37fba7eaca452ae65b3e45a73087.hip | 80 - ...22f2d99804198c61251b4629a3f18ed3dcd42e.hip | 138 - ...33ce1fa113b221e5303b4093c2c4e748ce8298.hip | 138 - ...42736d4f677a59a172bd6f162616a437696351.hip | 138 - ...7d7888480b83c78833214b32e10f37a6e20301.hip | 73 - ...9130607a2d24cb0662a47e9cf12c6602143838.hip | 138 - ...943fcc2e64c618fc1415b3f1a0db4d70aa8494.hip | 138 - ...daf9d4270d2ac61c299320e06ba73f44730364.hip | 80 - ...0cad6ad5b172e51c569e84cd54a19b4eb0ed05.hip | 138 - ...13a6d0f8c798c0c4ba4ad202d081899fe081ab.hip | 138 - ...6bc5faf18be193212217788d476ce6fd384bfb.hip | 138 - ...7faa0b33a9aada86f032174afd40d18efa7715.hip | 138 - ...81f8cce0d77dec9f977b9eeb0778b70a13fa75.hip | 138 - ...cdcb750f382fc7828a9886585f50efbe5be735.hip | 80 - ...d9fa7c2e13d0bad5fddb2b5a316bbc09d397ea.hip | 80 - ...da1c96568eab89a8f6498f8bb23c1223cdc7b0.hip | 80 - ...05aca3520b171bb82d10ad70fef44f28c19776.hip | 80 - ...4a573ce6b7d2f90aede543939315561cc43177.hip | 80 - ...588bcac681a5d69f252d7523a3681a0c6b6181.hip | 138 - ...81430c92864c29bb9f409e7c27caee1de00749.hip | 138 - ...d5c3c86398f6ce55abc90db3e362dbf9f457f2.hip | 80 - ...f7ea0aabd069362ba4bbd66623cea5b6e1a6bd.hip | 138 - ...0ef512b7862837f54acbc3b21e135a192647a3.hip | 80 - ...22c973581930ab7a4ebc90b3bf1cdaa229a87f.hip | 138 - ...411df58165946bf02942b597d94de7dd856987.hip | 138 - ...6806a4598c885e517e664fc8280c59ec3cbf11.hip | 138 - ...73b7c710d418f44dc2b41bec5905024334eae5.hip | 80 - ...77d95cdf45f6fec95d1812f2ef183a75259e38.hip | 138 - ...828c7d3f5574690f12f841c27f025206e6165b.hip | 138 - ...84fba2eec5899bb40d49d4508196e6be1ec1b1.hip | 138 - 
...e235e31d6955393ac8e825bd69ead70687b7c8.hip | 138 - ...f860d42fdc2cc6bd743d53ba546e332c22fedf.hip | 138 - ...105635385fbfb5d2f330df83ba6747bcb27f6d.hip | 73 - ...4f9af5e5ca519b21b71a54acb49f50b4999c47.hip | 80 - ...511de2592b6e350737e44865e1fed6496e3f32.hip | 73 - ...632f996eb63fbe4bc5748c5897b775087446a0.hip | 138 - ...6662cf1c9900a4334d2cadcc5f5ac3ad355f05.hip | 138 - ...73457ac3be01cc1595a015a5f598f8290c77e4.hip | 80 - ...a07ecf1a59f72ec6bef3e970d7f33cf54c5f44.hip | 138 - ...c142d869ef940ca876c93033ad53b576ed34f2.hip | 138 - ...047ea90076e3b0a3eb0586d49b9ee74ca6d279.hip | 73 - ...0861e81e5acc523fa680534eed757b7b4a4e1d.hip | 138 - ...2f61bf31dbb5de5d7039d5ff2338068a759b68.hip | 80 - ...3132e712eba8972ba444c604f89e01c5b84cc0.hip | 73 - ...5bf652702c2976551778b9159e09188575c63c.hip | 138 - ...6b3eef02b904304348b9d35f715b639d63218f.hip | 138 - ...8e4c1ca112afec494fbe47a85b553302c43395.hip | 138 - ...914c00690ac5c4f89cdbbaf00732ba66c5c0ef.hip | 80 - ...c9b46da8774462de8c24e14b12df3ed596eb57.hip | 65 - ...2013527a0266ad479715ee3e6ae01c45de29d0.hip | 138 - ...410fd9a4150c33186a2a365d06d8f6ea621c20.hip | 80 - ...5d90000b55ab8b6055b1934880fc6c4870b34b.hip | 138 - ...643917fc970c043d1c80d8d4b17ec92deeb8a1.hip | 138 - ...9668a3212cd00edaae871758be30a5a1fea589.hip | 138 - ...9e6b93baae25dff97a0bc9145a8d328ed3f317.hip | 138 - ...43da478310245e19e6c6a0d9ed7ad99540b3bc.hip | 138 - ...6ef175029a43e64164176d4eb212baf9d27bb9.hip | 138 - ...8d747083272ea657604ac84867ecea17bd65da.hip | 138 - ...938733446b6c0dcd159719f08d04a9aa467967.hip | 138 - ...b3225da1e1842f83592971a1f62a0fe30aa9d3.hip | 80 - ...60282ad39ef034fecbdb74acedfb48620b7dfd.hip | 80 - ...835ba70606c769e56d19dbfe74061361aa855e.hip | 138 - ...95783ae8f0034692efd6563f789ef03fd0f4f3.hip | 80 - ...d77b228420a3ead919474ec9c6fb2800f86890.hip | 138 - ...ea90eb5a527434c1740933a1d2dd863eccf14c.hip | 138 - ...f90358e522d7bb7c76c3a2c6010f0f38788bb6.hip | 80 - ...03018e71d57d3266fc35e2e18a78faa3dd52ce.hip | 138 - ...8639d44a4a8372a627a7c31e9527c8faa26f97.hip | 65 - ...c2000d32c230a57a6712f27bc0fba02722f5fd.hip | 138 - ...0bfced8745fbd9266207463fb41476dc23afff.hip | 138 - ...1d897ad17d7f6db2741b396e6b85a9b8f35286.hip | 80 - ...5e61dad8f63fb973cb2eb899c959e400622652.hip | 138 - ...8458c5a0720ef152848713119ebce6d76db6d6.hip | 138 - ...9071756e7d0582eb61ce6483fa3c988d2e10b5.hip | 138 - ...e4d2c757e4b8c366a2c320360e21ff0ef671a8.hip | 138 - ...f1ef32c4384ec26f3dc5e3af6a74fc8cebae92.hip | 138 - ...f2e2b108a53308a0cb6c123c8d318cbc2eadb4.hip | 138 - ...f7634d29bef11fd466b452a46b0612f38c949b.hip | 138 - ...0c484c2a366258941ee0051e139ea716a9de2f.hip | 138 - ...1a8bdf9d63b112e7fe5fa7e8835a6789cb8ecf.hip | 80 - ...2454f2d82184ab0491ea0675750c6ec55d659c.hip | 138 - ...2b4f995d622826af5d1f2bffa7ba68467c841a.hip | 65 - ...5a523f815eb822d66162d4feb75fe0bc50b648.hip | 65 - ...6c5836ba118969c4ba89ed62a98dffe3105738.hip | 73 - ...95d39cd62f20622a31f11a292ed175abb5fdf9.hip | 73 - ...bffc159b0bb826ba489ae763dae141bfe8e802.hip | 138 - ...c9e5384809b21f39e78bb2e43af345a9a21d19.hip | 138 - ...fe68ba10b3480dddc9866c51ca8b5efe962cc3.hip | 80 - ...3a980a26682d879c3a3425f3ba5be3f5761adf.hip | 73 - ...45129fc4995abcb8f880692f11c6186fc01641.hip | 138 - ...833fc01e88bd8e256ef64ae8251dd0ed10720b.hip | 80 - ...97c457144cb63a9c6c3d6be613b47bd0df9928.hip | 138 - ...d492377add5c8f6d0d2dbf9ee9e4338bbd9f1f.hip | 138 - ...e344010d49f7f9a6caab2cb84be7f87d2d96bf.hip | 138 - ...f6c5be53732eb1939a2f93232af7dc011dec1a.hip | 138 - ...0bcb241e5a1be1d35366461408d06e095a26ef.hip | 80 - 
...3326e055da32cc979892a2fbd0f7b003cb9f98.hip | 73 - ...3af90387f1d227119c5dcd4b71362940bbce52.hip | 138 - ...4050988e5790a28dbe10b4c20e14f10f6cf85c.hip | 138 - ...49a9b0801a06dd89c7f7182d7590b515df1592.hip | 80 - ...50073f6dfeb7ea77d5dce288a1d2f08f8f6362.hip | 80 - ...5317b6cde327a842170ebff20c2b03d81379ff.hip | 80 - ...8169ce4b4b9a17ac96fbb232e6a93f22071ab4.hip | 138 - ...823c3b99e7c8d1cdc39a5dbc7365a383bf9ccb.hip | 138 - ...a934408c75da5479cc41f96b98ea7d333635ea.hip | 80 - ...b6da1095bd8669c0e48b5cd808cf0dcefa2674.hip | 65 - ...0bda0feaade2b554d648d72f219ac9c389bf09.hip | 138 - ...2e75e6f659a500dd3cf2cfd65118f111342119.hip | 138 - ...77bd7e89ed832cc31b2995566a49bec6e4cb52.hip | 138 - ...7aede7762a524a7a424cc4dc46e43fdedf73a2.hip | 80 - ...808da5c2514806c2953bb77d5692e5d7c97aa3.hip | 138 - ...82e3c4e445e1e02f14435e4ca01a90850139a4.hip | 138 - ...9756060ac0e73dbcfc58a9222a78f0283cd029.hip | 138 - ...aba3ab83239e474412fcf89fe0fbef97e51bf1.hip | 138 - ...f351fc2c2da4a8e1760a3affc9a5947c6b3bda.hip | 80 - ...06f77a4054ca615d96636c0e2eba2a89850142.hip | 65 - ...1f2d1e57095f756ddd11e8e9d4f6f253e3ffa3.hip | 138 - ...23a26e0a59a8323dd97632e610d24624143fbe.hip | 80 - ...43460c011b8d5e01ea98c9b8ddce962de59a96.hip | 73 - ...446754d7000673779d15d3e73039fd3c10a720.hip | 80 - ...7b637e0313cb423b22cd8844cc2997b3ff73e4.hip | 138 - ...9a04b7f41dd6f0db017157a44790f35c626e2d.hip | 80 - ...9c659ba43bb907fd4e3e36a50958288bafd1a3.hip | 138 - ...a2b905c4ce32234c2af62328adae6b1f9217a8.hip | 138 - ...b33b5442d2e0948762b1f2147a321a9d6907be.hip | 138 - ...fac5a83def98340c8786d55a30a98ad68b9eed.hip | 138 - ...30f50071113dc4ab59468d568ac9deb06b0342.hip | 138 - ...43e401abbfb1b6737e4dc822f68421abbc648a.hip | 138 - ...8b4260626beeac76c26dbcee3cba1457b30e99.hip | 138 - ...a394a09c8691a534ad2219bedf73724b6dd5ce.hip | 138 - ...ba937ff6d0302ab013db7349d4feb914107f1f.hip | 138 - ...0247e301a7b076b6ec8a778c3b47e330638963.hip | 138 - ...32f2d658f1f69840fbad511ce8a3851c859d52.hip | 80 - ...55a23a0f24ff7062a4c286944f25d2db3e20a4.hip | 138 - ...024440e780fdf9ec94deccc85216d8bbb5788a.hip | 138 - ...3b7b04496e4db7c1ba2436485dc7c8a4c88448.hip | 138 - ...76a6de0e2612279e0ed64612f7393856bcc9ac.hip | 138 - ...c8e4d5c761fda50e010da779e8e4730051d403.hip | 138 - ...f0200092b0e18d57a9f5e512d565f1c0229436.hip | 138 - ...08502fd29d3a24b32177bcea968121ee809115.hip | 80 - ...10540b50e95e99a5cccebe47d9d3a83093c2fb.hip | 80 - ...1104394c8bef8d4ecff35c1409221e723a5a8a.hip | 80 - ...1731442b756308c0a869f21b7b8b103aa613e8.hip | 138 - ...222e158484773d2257f4a31e3dfbdb68336a8e.hip | 138 - ...63272d25bc2db2ffaa1fea87648b45ee68d408.hip | 138 - ...9df310195191895005b30151da8c1afab6c82f.hip | 138 - ...a968898f0bc6366313e41eddb5e3a3ed12dc98.hip | 80 - ...b807c48c472e9b1311a6037cd98e21d6706889.hip | 138 - ...c3760f5978baf9780ce4587ae4c768af0e49d1.hip | 138 - ...c4b866692ba5c3d115482bef4790733863c1fc.hip | 138 - ...06cc121ce8955ed59ea3b12b858ee2e0cf82f8.hip | 138 - ...0a6196b662a1d3dc7441a9536d825dc356b95d.hip | 138 - ...1500dd4c41e4d68834814a48a639f5ca36a2fb.hip | 138 - ...2a86568f89a5a5a165cfffbae9ca6949f2477e.hip | 138 - ...438250078ba2a47345ec4955dafb4e4de78a25.hip | 138 - ...527660fa7aeb9a951a9f2fc3c53989bd141c48.hip | 80 - ...5fbcb9e503e68fafea08abf86a4951f440850f.hip | 138 - ...652a27e8605cef59c8341813b68e7513be23c5.hip | 80 - ...7e27892bc57f3dec0da24f94f2a483d6c9321b.hip | 73 - ...8a311bafd1c153525393b252e4170f8aafb370.hip | 138 - ...099fcfc218ffdf69edb4f2f0e46121bea9fafc.hip | 138 - ...746071156e9ad46f403a539dc237e0a44122a7.hip | 65 - 
...e7c1e5f41a451c7baff54f7238b220f1bdf8a1.hip | 138 - ...00f0af03743dce328486f8fc805dd30bd6da31.hip | 138 - ...08103188e27b3bc55dce0c1716c0b4d32d6494.hip | 138 - ...2d29c85070f488a14b1915f948e5fd69019c99.hip | 138 - ...4932e2655d7b32704be8de9a63bbd8c3369f02.hip | 73 - ...5a939a2491166dc520e9a2b9de7e43671e0c2b.hip | 65 - ...5ea796c8d97bfe3b7c9663bf15e2e5e7696235.hip | 138 - ...807a8e90bf1cd839f32fd718afa6469c35a4fa.hip | 138 - ...9241529745bf138552f49d9a93db418663ad65.hip | 138 - ...c2db98d8e2e690f499f41cfd5afb831b756f54.hip | 80 - ...11c54e6a6f9eec378d8b661121066536195d3a.hip | 138 - ...1425a006aeeff4d69c8570cb6bf1e1427d2c21.hip | 80 - ...4121d3bad1d448bd413718fa096f54faa12e95.hip | 138 - ...6f83cb96d0313abcdb24955edd4264df72aed7.hip | 80 - ...7f7e626135cc9176a295f3d1f336a7c3852688.hip | 138 - ...8399e756ed5026baf3ab78af17489dc07b9532.hip | 138 - ...8d28c958c0a831a615a4811d13279b18db09c4.hip | 138 - ...42b78913a853a62dbff8b99d9ae3fa458f461d.hip | 138 - ...6662dccf2f650bcd8123c49006c759cd4c0ef6.hip | 80 - ...7e58867c46d96c9bbaa96eaaa9f93595c9e099.hip | 80 - ...a0a960541bd8a2dc6741579de685b7c0a5f6d7.hip | 138 - ...7b70f54cb2778b5ce3df936b477f775eea8b3c.hip | 73 - ...8759ae25465c32960487375828e23c5f1ac869.hip | 138 - ...8bf438642e5d863e31145ada2a0688059aa5d9.hip | 138 - ...ad61bf8427a26775969f8a9166fd0bfb7446b4.hip | 138 - ...fe04467e87ec2110f60c7aea0cc9bf2ca07481.hip | 138 - ...010c9bf7341588f071f889b7a0b4dcc4e7a14c.hip | 138 - ...1b29d9888365bff0f109d897b508eebfd8a61f.hip | 138 - ...24e97d5ecba46e06d5ec1a9456c810d80227a3.hip | 138 - ...273a2f8e6bbb42ba0b0871b6c95abb34531f33.hip | 73 - ...a5ff72f22e0ad040a281e66b1aca0bf3a2aadb.hip | 138 - ...abcbeaa4d33d3150f2b0238bb62ebbfe960980.hip | 138 - ...b94d76503e13c911781169fbc378517332c42e.hip | 138 - ...bb367362fe2c4849ded728ec5dd00969ce188f.hip | 138 - ...e12dad9e3bafe177ed3c27c833825813e18fc3.hip | 138 - ...f8a89468cf9c8606cf12a930db062a83cd0ea0.hip | 65 - ...37d9dfb68351de2942e32f35e2ca1ce71edfa8.hip | 138 - ...422621a00ff79b2f5ec0dafb957c77693537b3.hip | 80 - ...67a8807c9451b09227c0f685c18aafeb062fd2.hip | 138 - ...92d5df4ba2e999caf6889a852db4e1ba078e65.hip | 80 - ...d3071347a0c98f3221104036f477aa13bffa4d.hip | 138 - ...1dca5feb864e8981387c2d07e62acef1730aa8.hip | 138 - ...2280997eb6f1d091094fc54cecf42b7c9c3a2d.hip | 138 - ...2643099365d0903c799585f41dc1a525ac9f9e.hip | 138 - ...6b9566559ed2b1c85f2bea1c55e72c41dc47bd.hip | 138 - ...f86f458fb4dfcceb7db3357fbae0dc15142a15.hip | 138 - ...fbb5ac9048a962a60f48886728220ae6c2aeaf.hip | 73 - ...26eafe76cca8e74e819220b6de1f4279d48e43.hip | 80 - ...4ecb47f9ebe8c2784976c3e9bbe4834b475cf1.hip | 138 - ...508b92f7e123b21658f6e17d624ffa87831fee.hip | 138 - ...5b3c218e4a7b459e54080e24c5b730221eac02.hip | 138 - ...b129e6dee6848043dd0e8fa812ae80fec4d014.hip | 138 - ...b3b682eab96e4e173affad75b9d8e73f1dd690.hip | 138 - ...e7cea6df8e6dd56194e1172f28943667f1c4ef.hip | 80 - ...ed3aaf24c73073c604a3b23bb4b0358b8e3490.hip | 80 - ...1454ffc1418dac641f63671e947d9f550b1f0c.hip | 138 - ...38bb80e9880335faaea81985ed5d0e713ecb08.hip | 138 - ...3b7e4b8c1efe59f79a15512716fce2282a79a7.hip | 65 - ...64c33870ebc329921cfa3867d58b1857421f65.hip | 138 - ...b0cee09d633b6f70febbba63a1e090522cfb4a.hip | 138 - ...ce3baac1e3ca03af0c3f4ee4d0158ad1031e9f.hip | 80 - ...cf0a9d5a5451da5dbf6075ccea45e4a140550a.hip | 138 - ...d7a9ca49c1149d46f6b05b0fefc41ecaeb6ea1.hip | 138 - ...f45927b6d931e31e2209685d787efa28eed8ba.hip | 138 - ...1cea88a2277b87d405025ba256272a1720f88d.hip | 138 - ...289100991d4c8c362f64c8f6c4ba395c2f3495.hip | 138 - 
...3f3eb2f5eb1f3287879604892b1c230df85f1d.hip | 80 - ...45624dc6e33c477c73a155500b015b6c010de8.hip | 65 - ...55cb42b0096a8ae338ce100f86e378aa1a04c9.hip | 138 - ...a8c31f6d5bcaacfa4a21aed4d1d3caecb48922.hip | 138 - ...ba3cd44f78c950fe7ceaa5f0629dfc607b30f1.hip | 138 - ...ff884e176ec7cff86d17c6afe1ddaa4dd6007d.hip | 138 - ...143d88eaa0d9cfea856b2f3a57d1275a656627.hip | 138 - ...2557f206fd81d82a3b9d59113105040beb891f.hip | 138 - ...562e6c3af28b8478020ce3c3bf73c036001c93.hip | 138 - ...61b019e1398a6a3c36143fb84b5ff22c9f4508.hip | 138 - ...839660557dee9d5bcda9b56940ce23236c5f6d.hip | 138 - ...b2ea922daabbba131b90713e06d8caf5f30662.hip | 73 - ...cf565a5a1c4a09887c67ac3b9a019dca427ac0.hip | 80 - ...34433b784d1e405ade3378918641372a30bf6b.hip | 80 - ...5e01b4f2ca8ea10898c39d6570bd74e85f46ed.hip | 138 - ...7315955f555768f24585a50d75e216c40f062d.hip | 138 - ...ad30ff0739ab5dede67a96e859f8c474c245f8.hip | 65 - ...cc6893456a559c7d22714116022fc69b372266.hip | 80 - ...18b1fcee808b6cccd131418b6ae9e8bf900d8f.hip | 138 - ...18f690b6322588041bb467beabd8a7bc79a2e0.hip | 73 - ...357c5e9739eae136a7abf92bc38d3ac94753f8.hip | 138 - ...52ca6a3ec02f6559e4bbf1edde42ad2d127c26.hip | 138 - ...5e7efa263223148318ae96bd1929b382e994e1.hip | 138 - ...aa64439b80ff8dd12498b3e5f6b625da16e285.hip | 138 - ...db688a9189e1c47c300d474df946a248a63303.hip | 138 - ...18e3ab290263ed2576feaf22a1944bf2ddcb7a.hip | 138 - ...5b183c50dd2663dabe3eb8b780913b778c54ab.hip | 138 - ...60f6b6d0869740a5a411abd80108f729f810eb.hip | 138 - ...7b1cb14b67dc82f614831550f7deb0895bd7e4.hip | 138 - ...9461cdb5687ebbb7bf0be136071d70420c1619.hip | 80 - ...b68458076e6cb129d3ec793e95b91430a0c8a1.hip | 80 - ...db3f29d1940e59dadc357c040ea37a6ff208d9.hip | 138 - ...17a48a1677bd26cd48e512f1fc8830a8a551b8.hip | 80 - ...8ce4e14cf94b284ffa735fe03d923cc74c9fe0.hip | 80 - ...9b82a27571ac91e3631cbdb7e0a58155abf962.hip | 73 - ...e2326066c91452335eac05f25a6311376bd9e5.hip | 138 - ...06c6c37cf472ad262f53941611b5e60072bdf6.hip | 138 - ...47e039c003489dd528faf5d710e687321a3fd7.hip | 138 - ...56b3a2ff49f72b91a6b9c215df285f2798ad47.hip | 138 - ...77ac04be3a6cbdbfbe57612a469412812fb5b5.hip | 138 - ...8e3565f4c720e6c9691b0d33c1392936e2e7ae.hip | 138 - ...95d3c96b3f4556b9765fd0a3b5701b2fb10948.hip | 73 - ...e7c78e8f65be35e2753a0ad5123118555c56b2.hip | 138 - ...f2156a04b18bab55af60e9357f28d8a4604e8e.hip | 138 - ...09f2a7deb027e864afdfc9975d3ab93c5dcc9a.hip | 138 - ...32c5214c4d40c54ca2d02f0d4785c6d6902370.hip | 138 - ...462715ed5f192532760d6f4c66ff9d4e20e254.hip | 138 - ...564dddf8b492d80be54854abb8d1d831e42679.hip | 138 - ...5cd8fa559588f4264ce6192f2de3e3065365ea.hip | 80 - ...5e28a8a51cd435130ded2abc9fc606e522c713.hip | 80 - ...62b192a64efb60d5484798526278ac7a0fb9fa.hip | 138 - ...66b6c6b2ec3acb40ac1cda432efa1e4e62d9d9.hip | 138 - ...690e48f30657b0fcfa26fb3b9af3ef76e792e3.hip | 138 - ...c181996532676f2140fd026707135144e9d37b.hip | 138 - ...cc95831c347212021c0bab7b43acd7daabce42.hip | 138 - ...d82b58fdc3e5b7a7c20490ce7f5acce4e6ec79.hip | 138 - ...1fbbdc2dcf2ec81efce34673ee6c425cc16ca2.hip | 138 - ...68af1b2f104664fd05d21ad789aed39ecfa42b.hip | 138 - ...7eaffbff3c58183a656687010daa2c16cfc26e.hip | 73 - ...8d708d13577f2b92e6d5adfe952a87e0cf7be5.hip | 138 - ...9c8fb6028991321b09a990c2188d854d940268.hip | 138 - ...9ea3713aef9b916e1b38a882a45012930924d3.hip | 138 - ...b9871c220c0065d74bffeed4021d0304a9625c.hip | 80 - ...f4363f50af1e7ccd24751d5f5b181bf32c604f.hip | 138 - ...01680af41c8738089ff377147e0547dcad114d.hip | 138 - ...1737a13e24009bf1a5a4b780175043a9f2e33e.hip | 138 - 
...66db0ff7b035e54f2c0e59acedc2131b722a55.hip | 138 - ...8a5f057fd5cef2df5f919f5102f47e86901e3b.hip | 138 - ...4fe2d739eca8c93fdcb2c105d4154cee6ca1c1.hip | 138 - ...548aa042c69bb9c59a8bf706b44028aaa41830.hip | 138 - ...f3ced9b5ddb0dfee8ed5e7df8eca0bbe273047.hip | 80 - ...fe73f04cef91cd2a0682e905483968ff80eadb.hip | 138 - ...1415463f0316ebe25ff2fda47c68cc54db3359.hip | 80 - ...24e1f8cda50f80988857611da766685da94494.hip | 138 - ...280c91d7cd8712fd533e246a6b0f758834abc9.hip | 138 - ...2e34930d11ff493007b1613993e01acc1af78d.hip | 138 - ...300e0aeabe337785d4c7b41796ce65df6cc42a.hip | 80 - ...3eaea4096c8f5bee16a64860432f0634a253d8.hip | 80 - ...435e5dd23e49e19dd313f9891ffec800ce74c2.hip | 80 - ...6f6c7c7655c34b7b9973ff357b0813f0a3fd7c.hip | 138 - ...7724686efd35731e5335efa949486c93ae26e3.hip | 138 - ...9e7be0f85656d012a6451b65f6c1d2613b187d.hip | 80 - ...ae3af78583258c4b13c11a442022e0e058bb85.hip | 138 - ...d7d145f96aa8958a9208d0c8887742a8c834fd.hip | 80 - ...e9e858abf6f77489f3fadc4ee81edacd26705a.hip | 138 - ...04c5910a2d0595b39a3f87652a9d1ef4fcbe80.hip | 138 - ...0a68220a7b621ae9817d7b77f55de239b0a4f3.hip | 73 - ...11bdd71351610d55916d452495e599960d0a41.hip | 80 - ...2fbc418e829f89bcb8d93f8afd2869dd8dfccc.hip | 73 - ...d4c005d723cdab9fbc307933c1257d114b539e.hip | 138 - ...f5017cc0f5c8c8dc71492e7765cf729c1f225c.hip | 138 - ...06b5b153ea6e8b1e20d9aad9d4633333fd98f5.hip | 138 - ...2e6b05e7e4de2cb23d815f8b2c8adf22131c0c.hip | 138 - ...4a00bd6ea27ff20a2903d619e1361b5e27672a.hip | 80 - ...5dbf601de5754c03a03a1a42395dc0766fb8ac.hip | 138 - ...9f3da698a6103caf25d785928dd9f814ac27b4.hip | 138 - ...b5d6e8fbfd92e9f7e47bda5cfbb0d4162a6319.hip | 80 - ...fd02981f92fbef6277c1985cc479c12bae9239.hip | 138 - ...1eaca3c37a82d19f8dc91f06764170069ca3af.hip | 138 - ...2e7f96b095ebfb66ecc7a75752fba2a63e4f37.hip | 138 - ...30f472f00bec9da0564ddc40e07112b5f9a117.hip | 138 - ...45948f2795293e72530b02669c4f549608ea7f.hip | 138 - ...4c03c916393d6be7c5181369ebcef949eaa763.hip | 138 - ...68e4d00295b294320b94bc777d7d34609127e0.hip | 65 - ...7393d55600c9892558248f4131fc06a6cf3309.hip | 80 - ...74439f42140cdda9bb0f78d995d741212a35f4.hip | 138 - ...76e5dce9af523422782dd25d8dcf6f25edc68f.hip | 80 - ...af664bfdf070362bcc91af77d1bc406f744351.hip | 138 - ...c48576f285325345fa1205e5e7e01787b74f71.hip | 138 - ...d4d46397a3749646b232b306688e52b8c6e584.hip | 138 - ...e4a98f150f3f9ab6f03b5fd0968c5454565c9a.hip | 138 - ...eca56234ff6fb4f23b9b24822887fd9a3d0df9.hip | 80 - ...ef4d120e71bfcfe61d67aa44d24ceb907c2b9e.hip | 65 - ...0c50a1fac82d47dff2357ee3ddbfa0b2c8d487.hip | 73 - ...69d06e3f32e3b6d28d3e54ad764b472741c193.hip | 138 - ...8720923c3452e3aebd7b9c1b4b23f0c35d7e4f.hip | 80 - ...abdafad0bf803223ba5e8f474cd59233dc48cb.hip | 73 - ...b1861e31df98bdfd731efc3d335055090d83af.hip | 138 - ...d3de43cc1f7588d62a10362f59d113ee818846.hip | 80 - ...e03571f1d2779bdeaf0a6a2d617e236d191c11.hip | 138 - ...e671f5defd76ca08614a7a1f184c36c0f1e2ab.hip | 80 - ...3b1ae63e127b6e6afe39e354d4995afc5faeaf.hip | 138 - ...5f3cf0f78f73df79665c26b20b0805615e1b04.hip | 138 - ...65e58c9f147498ed04dd51fe1393770603a6d3.hip | 138 - ...7dc0f356b630179916f8fc2041b7f1402b46df.hip | 138 - ...a9e9b7277bc90518ab92860bef2097ba96d982.hip | 80 - ...b2e63cfebcf84043f79be0321708cd159c62b9.hip | 138 - ...bdd9c3f496a27bde68cf86374999ff2dd53505.hip | 80 - ...c87b7d385e7b092e4706c464217b004fd8a6a4.hip | 138 - ...de56efe17f4fd36a11cc959320a5e43f1dc232.hip | 138 - ...0a88ccef04e81b8c684b695f7cb4310e448915.hip | 138 - ...15e4f16de26068cba30ef12fc29332d45e460e.hip | 138 - 
...47f8fa40332c6ed12d9971e0b539049a871c34.hip | 80 - ...760de14b71a41882ec4a2c7362565af36d1a5d.hip | 138 - ...79dce18e49ffe024fe4cd0693ad3399f5edaee.hip | 138 - ...9a933b916285d9580a76df543cfafc88a536cb.hip | 138 - ...c2075f394acfb14fae7b1ef4304fd9b654ba0d.hip | 138 - ...d6da5357b67cc28aee4afa9523adaf055c4e32.hip | 138 - ...f35d82ceb4af2e07719c16109c6d72eaedce67.hip | 138 - ...0aded9d1baec3125ce8e176248cb146ca580fa.hip | 138 - ...1e1c969b57659e7e1367ac9ba10ed5ef5b69a9.hip | 73 - ...44435491aa68acb3217b0e693232c67641a2db.hip | 138 - ...4a5d56721bb1a1332a65882132a8c5763932ec.hip | 138 - ...6243c6850c0a2d2b7bf1476e12f95f187257b6.hip | 138 - ...a4d21931b9afcbd70b1567995d3eeb6f9308aa.hip | 80 - ...a883a36a76edb276a66c5d779294f170d6d4b7.hip | 80 - ...d34faa8b168e2ac7862641229e6146d3e28aee.hip | 138 - ...e530cbf6363a8f08a94728e45e88ecde299e7b.hip | 138 - ...f20bafbf156fe8fb80bdd84a5d2f3a4a944c1a.hip | 138 - ...1dcf3213efd214cc2ce8c9ba0027f991d241b4.hip | 80 - ...52b2318dbb78b1a82ef03666a35a623f44481b.hip | 138 - ...93976cb7b32a8bd28ce92fc13af00a3e21f737.hip | 80 - ...e59bd079f4d205b613056f975fd2b4e372ab10.hip | 80 - ...e7b11019fc2299d70869253877319b03388244.hip | 80 - ...f887556a3540609649744957651ca667b91774.hip | 65 - ...f915b4d9bd18a3c25a85917392ea4a5e88b349.hip | 138 - ...5128c6978449b33ce0c35b02a9e9aaad65ef7a.hip | 138 - ...2a2a9435103ed405dc1500d31652f1d431a49d.hip | 80 - ...3e5bf45ec5008aa3aba4773e68a78e122b2fe7.hip | 80 - ...688999141a72e61322140db29043ef9f7fbc3d.hip | 80 - ...6c89b7a04758b4badbf9695b316f877b8bb053.hip | 138 - ...8db08068589c6e4c096054d26a2e5be63285b6.hip | 80 - ...a89981a05963efcea7ba5c1e967638beeebbbb.hip | 80 - ...a8a323414448c50571a334f29bc0a38919b61d.hip | 80 - ...2a6ffd8a21d3e98342fd401f0247f62ca4e038.hip | 80 - ...44427df3ae9392c4fc4c25c232196828e70648.hip | 138 - ...82a30dcf702daae19bd6705864bfe36e09502c.hip | 80 - ...bd60bd2afee49b30a583c32a45ae9f2076db08.hip | 65 - ...03eec1cdd216d5c4a7ba977e2ef92a0d7fcc8b.hip | 138 - ...0bd57333c6839ccf5cf2e928edb996bc60c371.hip | 138 - ...1874a7633e5713720b9d084b6d1c6715a51a17.hip | 80 - ...208a6e8c5263e38f9ffcb062564ab61d2785ff.hip | 80 - ...35b4651a90e331fcdcf224282457e3dc038a30.hip | 80 - ...402a22ceee3b665a3f24edb98b8398c35c6f5a.hip | 138 - ...548ad36fb92d0963893146c8db20f53cbf0c8f.hip | 138 - ...67aea26852aa9a9e3dae76b906005ddf6fbae1.hip | 138 - ...8b347672451e8391388a400d016803f4c4cf8d.hip | 138 - ...940ce53998becf9bddf56df7d19894a7658168.hip | 80 - ...9b6956eaf678f7eb901567d1a515eddbedae5f.hip | 138 - ...b6e18b10d529eb6b32d7c19c59eaefc7184376.hip | 138 - ...ff49018f1c12b9fa31e523ad40b9cc162ba34d.hip | 80 - ...5ba79201a585bc091ccfc326fd24e851d1eecc.hip | 138 - ...6cd05288e1666f5c67fb87ad02ce660e4c589c.hip | 138 - ...b14cf2998a61611d1de2594e926fcdc378999c.hip | 138 - ...bd9c4f1b7a0621c67f3e964d946ce22fb2fc80.hip | 138 - ...bf8444c1c26b91fd490c7216f4d0f8aa0a1f1a.hip | 138 - ...cda610c235987e13232e828f8d86fa88030560.hip | 80 - ...ea83a47c6299fefa4220ed88f7a8e1dd938215.hip | 138 - ...6b4782793c6526bfce7362efbf6bf069928b2b.hip | 80 - ...6e26d4969bc6bbe9b092bedab11cddb3360c0f.hip | 138 - ...964a17f902257aca9d08c736516a2c67d9a0e9.hip | 138 - ...cc4399c5567a9495f17d54c712cc9e65e57521.hip | 138 - ...de9a7dfb1201b56528740e9d8a07b62710fcaf.hip | 138 - ...ffe9e21362afe9c3a407c09d5de186954931a6.hip | 80 - ...24d91c1fd6290a6cf8d52a3801ac6b921dc7d4.hip | 138 - ...2e68bd619e118292768f0925ccf92cbfa68415.hip | 138 - ...32094f5917e9164ee0f973ac6ec47245a69101.hip | 80 - ...89f267d34c9961ced63ad07ffea2c6d2911415.hip | 138 - 
...54f09511778dd1779a839b0b194896070f69ad.hip | 138 - ...679919fcd292a2a69543de0db94e2985c9d364.hip | 80 - ...762476c7f2bb05dce92ec22c0acbeb03676746.hip | 138 - ...7fc33d02b1932235b8d152e57559060211d591.hip | 138 - ...a784fb478ff5b3f1e2da9765a3a777efda92e3.hip | 80 - ...a7ab44bbd9fbc97c7805860d5f6ac81d6ae468.hip | 65 - ...eb2edc7738d8d18ac359691da261ceaaf71788.hip | 138 - ...19133d2ed892745013b2fc5d503414cf0a4d83.hip | 14395 ---------------- ...39e6610e41aff8d1ccdb66d9e84d3e48e8d379.hip | 138 - ...4929c433b049a8cf949ff476309a8faf5c25fb.hip | 138 - ...7a0276ec419f18f060a5186e6bb703ae434ac8.hip | 138 - ...901147b7188212b8d8feea15831a11425fe4b3.hip | 138 - ...beb9cb4e161f9dcff79080149076488d436301.hip | 138 - ...d366421e0b51c90fa53c366d47ed8d51b3a329.hip | 138 - ...05b4e7782bd0e29ca9f6d33fc59d4304136d41.hip | 138 - ...216f777feec4752f5882677b18168225da4b53.hip | 73 - ...29b93cee012c79d4364502f1d90f947c73641d.hip | 138 - ...85ae0a16e4b293b549bcb6a3ee52df7fccca32.hip | 73 - ...ba1183efe205af38e79a1b2dccea5fa515d02e.hip | 138 - ...ce1c9b00f160a17355d4583d49c47887ac33c8.hip | 138 - ...f96b404feac271dac8f4190180754480d3ba80.hip | 138 - ...413bdc825ae863d53dab548f2145dc0de8fd37.hip | 65 - ...55946ff3c15a44b9c741e9f6bbbcb5bd4c8577.hip | 138 - ...7a4ea3bb8905a22ae97a94c354b1cbe38093bb.hip | 138 - ...a578c0e7abf1127dd0370f06d7278656c93ab9.hip | 65 - ...c803342862aa30e23e5be7d84e611bc571c529.hip | 80 - ...e9ed84ad9be1627db7a66af9370679816c0897.hip | 138 - ...ead6be6e39ece0e5d44335083336f7f546d2f8.hip | 138 - ...36fc744dfb0d985c9113175e76c7ec1c935054.hip | 138 - ...742b9ac6749f189d597ac97d46d35189472c50.hip | 138 - ...d03e29403ad53d6d52e5e81182ea6ff5aff2be.hip | 80 - ...d41b6f578f3c903eb9d58ebfab62eb296044e0.hip | 138 - ...707d065ae152450f9def619ddc3dddb9089e88.hip | 138 - ...7ed4c885fb32a0b548186e56d64bab98071d30.hip | 138 - ...aedab8931f2eefb649b91e80145cb71b63360c.hip | 138 - ...e27c4081377f59363c2bf2ea8624217566d2d3.hip | 138 - ...0abf4e2b6be3e2c555c2134705b9dcaee617ce.hip | 138 - ...62968de58d9df7d687d671f37d63393f189321.hip | 138 - ...735b12d130ebf849ac5d6752e413ecf3e69fbf.hip | 138 - ...840be0741afa4d41fd4789c8300223fdc63ddc.hip | 65 - ...a53f7c6370845fa94aa9b395c52fd1900b62de.hip | 138 - ...fe77ca5c394a60af0313072cdd132216a52bf3.hip | 80 - ...20263fd84776f155519b3481be5e2c5b035585.hip | 73 - ...3c3bed2b584ea2031debf9f953f5f8f7012171.hip | 138 - ...71e663978dbcba859c5114ec675a712e343fd6.hip | 73 - ...8925f929a5b26f3544ca31938aa75b3c59d34d.hip | 138 - ...954a393b7b5a7131c13d0c4578443f468a738d.hip | 138 - ...a19223cf296d7fd10e15e2571e63c84a80fbb1.hip | 138 - ...a7fafd4227918e0c7f0c6ca3b2bd673cd07279.hip | 80 - ...b062527121e627871b3f1b2a94b96c42e51205.hip | 80 - ...c66c5b53f83bf1e023e81e9d51f0285b3ae731.hip | 138 - ...18ab272d7306689c7dc5a6d5326efea1471235.hip | 138 - ...49c01db99fce654e9351e711b113cf7424550a.hip | 138 - ...6f5e0b99814b0a82a731de36f28024bc317801.hip | 80 - ...801d21c14796c08377349ec86a6c800af497b7.hip | 80 - ...82d55544b5280b49b071ea277fb1827193fa2a.hip | 138 - ...9616f72bf16a060fa50091ac139ddc06bf9d88.hip | 138 - ...9f68180582384ba81aae2b1d4a4c52dde2c68c.hip | 73 - ...efa9c427dc278c0d1bc31189f683cd45e4d873.hip | 80 - ...204f6805d5d830aa6fca2a9b5f238ed63c3a73.hip | 80 - ...220f6dca850a5b5ccf1f619a267c40c37efeca.hip | 65 - ...4a9f10ebc51bde3f580ef527c17f89489c12c7.hip | 138 - ...5430cb65d8d540836c7f12b3367abd3c8e63d2.hip | 138 - ...8031345ea71cc17e458eb97a559b7c94d3ae43.hip | 138 - ...896aa9e4e4d7e494c1755b1e77a08e0e264f8d.hip | 138 - ...a44ac409e914c12281f1d26e5b52d8bfd0df75.hip | 
138 - ...a9e92183ba87924e73ff0b5e25bd12d6038e69.hip | 80 - ...048a8ae1c0096f3372b0114c15edbe813425fd.hip | 138 - ...14f820b39a8ba81e547a78ed19a909ac13221c.hip | 138 - ...1da34ee666903307d3a09b7a032f2a70054759.hip | 138 - ...8b28f65f19e7d1b22fb3b85b7cf3d09cd54ebc.hip | 138 - ...9e0b97b3fece7c12504f4c8f1860d611b57269.hip | 138 - ...ab710e4acc711430745e05e036dd6a4d6bcdca.hip | 138 - ...ba7a5a0f3a714eb5f9f2af20f7bfbc82a30350.hip | 138 - ...eb2f81e73d65fddce7ff43c397da6529317607.hip | 138 - ...4d530731c7ade2c7beecfd1bbbca8583032217.hip | 138 - ...60621af3f7e1e81a8be48fea8d2750fdecbbf4.hip | 138 - ...76eb68c550b50b9aea42a7a2cc3bda186b0e40.hip | 138 - ...c411351ec59bdbed2590c599f9eddf7807b371.hip | 80 - ...f121a3c8928c10a2d86b487cd13fa995da670d.hip | 138 - ...3b3798f11997d33ccb58d90ed6c10d5411b735.hip | 138 - ...9336d59a8b35919e593217b6fd4314a04ea359.hip | 138 - ...a0ca185449a49fa485892fde6af745ba758167.hip | 138 - ...b3488ddf3bb1a4870371882f0a5d267bdfdf73.hip | 138 - ...c3c1e3dac623f07c2dc1b934ccb868cafcb38c.hip | 138 - ...cf03c0aa3f1b2a7b76b4e3418eb5063b982a29.hip | 138 - ...fe2db75cb20428856b02cd1cc8d7b393a6ad9c.hip | 138 - ...794d9c185b21f59274ac5d4db10a7abc0be968.hip | 138 - ...8552954505a2092662071401e135e84956c4c0.hip | 65 - ...910c8b7a30acc731948ab58467fdbe4fe32f6d.hip | 80 - ...1b49505cfecbe4ec3e5c7371de3aaaa85ac9d5.hip | 80 - ...1ffaf653085dd7f122d603bb3ba4b001e5f3c0.hip | 138 - ...2767e588220d0dc6137b00cc1d8dcc91e97134.hip | 138 - ...49f19deeaea20663bee781af7edced7f7a4fc0.hip | 80 - ...968bbf7e210911fcb95ba90c79837230ab1ce3.hip | 138 - ...a020f728df204ff51e37d2ddc21afb0aad5e7b.hip | 80 - ...be70b088b20fc8de464167c35745461ddab640.hip | 138 - ...f651d3415562206c1049b172261fddba01ea6c.hip | 138 - ...1828f15eec2a58be23063a1a8132d337cd26de.hip | 73 - ...67cce35ab784aa42ebcb75af7305bc38a8721a.hip | 138 - ...85dcec0197fdbb50124ab06efa627f1a2c0567.hip | 80 - ...8a4a8210a972bb2ed89d6ac754fb79438ab2da.hip | 138 - ...fb736c61088b8dd92fe0371f5c98e23bf9077f.hip | 80 - ...0e81c3700f130df142c9a37a368944ca548721.hip | 138 - ...3e8a33fdb7053760c9c135002b0a94facbe015.hip | 138 - ...7f4aaafd1a5b9ee85aadc6fab79ad0c27a2ea2.hip | 138 - ...8aaa193f332ed13e017e78ec07a7c80e45f6c5.hip | 80 - ...05ba47078abd7a5b6a51eb93b26095517e7f70.hip | 80 - ...214eb450c3b249017480efb8d092b0edad6dc3.hip | 138 - ...79ef43adffdb62100270a62706fb811963925a.hip | 138 - ...cbe8eca7e3510f5caa7f13419cfbefbf031754.hip | 138 - ...3f42d5c9ccdd3807e488b00f02bc6ab5d8d99a.hip | 138 - ...4b6226b355bf35d4d07aaef1828091f03ad2ec.hip | 80 - ...66604bb15f97a56847a7c968dbe32d247cbc13.hip | 138 - ...7b6781ffff9a42beebb4d73f0d15461ddd4479.hip | 138 - ...7eb3d86aa385f9ecffbc5ba10489e56856f918.hip | 138 - ...95543aeed81adfb6d847f78212585a36122ae3.hip | 138 - ...beb7b50ae6a1fc62535b9a1dabbde6f177a9d0.hip | 138 - ...f23d1460abfe875e71f7911697c42fef0f41c5.hip | 138 - ...f4c15a119e805e4407b184625f57966f8833d9.hip | 138 - ...0ef67ce0f178aa2863c4909f5bdd7f766c9b2f.hip | 80 - ...638314efcc4f16aa4a6e58e6caf2fda1711519.hip | 138 - ...ad2ed9f91bc1efd89ea66cd5c775fa140cf931.hip | 138 - ...fb7075345704340ff33dc0ef7c04ef127f26ad.hip | 73 - ...07bf9c05e41dcf2416e05dab4bdde17158db76.hip | 138 - ...17b92fab5bee7717bf9aff6a6bef7cee3816e7.hip | 138 - ...307974bdeeef95cca0d130ebb7aeb77fb1b6eb.hip | 138 - ...40d762ed576832b3a752453e9881b5fe6d2650.hip | 138 - ...470f5c6fb81032fcd7974180297d4bb2a8427d.hip | 138 - ...5aad18f59e47a3fa3278c7ef1a6372830c33d5.hip | 80 - ...b86621d626722434f2ae9b7b8ab435a8dd8827.hip | 138 - ...d707cf48a17d31abef94215c5720419faa0a39.hip | 138 - 
...240106c771ebea461fc2a87b6da68e510aba70.hip | 80 - ...6a4475ea795935f4cbf2dc0ac156a33d754587.hip | 138 - ...7e1d245baabe2f6293e3d85318f9936b333500.hip | 138 - ...8cda718e10824956f0ee39bbb0891eafa45a7b.hip | 138 - ...ca9cd905ea8b0454cf9564643894682b08cb97.hip | 138 - ...ebd0c2fbfc85f938b10535855c388971129a28.hip | 65 - ...f5803b33d97db72eb8a8528aeb3fc956a938cc.hip | 138 - ...31b3345893eec8ed1ddf1d8de2512b46ff6187.hip | 138 - ...3d098f8bb63133924aab70d26a6ed64018c13b.hip | 138 - ...8788c537cbf6833c58a6ca15c0a36de33c9fbd.hip | 138 - ...88527a2cdb5adf51407f4661a254bb32d7de23.hip | 80 - ...a6478cc27e52fd9511fbff38369c921155cfb9.hip | 80 - ...f4605d82507fc4bd6e96095eaee5173ea41973.hip | 138 - ...f58a5186d69efd6062f3717bd315394ea6592b.hip | 138 - ...3246f1f53a988cf252eff88bdf814bd382d3ac.hip | 138 - ...586668a61ab88bc46b763df8f1c2ea52001ea0.hip | 138 - ...c8e45f6ea7cf5dba9eeadd0b19481d9f5defb7.hip | 138 - ...cf755f1485c065222be4daab84283a9c3d0eb7.hip | 138 - ...4c5369aa848021e020d874289e3ae4e0f74d77.hip | 138 - ...77f939ac3dae8749cbf4232dcf04d2cf63b48f.hip | 138 - ...a2d046629a4b65c90d0e18d061c4984062f844.hip | 138 - ...b6100efe30d836dab557ea4ac54c4b9d35c6aa.hip | 138 - ...dcbe9f481c92215f3b636bc0e86ce8f65e6472.hip | 138 - ...e3980331dc4bcec6ab6f4c345c7b5f71356979.hip | 138 - ...e5fb3544dafa9da03fd2de4bb9bd0718f6009f.hip | 65 - ...37ce5f3cf13ace3efc0b0227ae5a8c1fdfce1d.hip | 138 - ...4d1d4408196d611b2e0535bf8833652acbd6ef.hip | 73 - ...64e378e1ea1d4dd97f6949d66f3492883b663e.hip | 138 - ...abb25dba0c48b380b2dabeb6ab7efaa706d180.hip | 138 - ...09c38fc8a2d5ad6efd449107dc54a7509624fe.hip | 138 - ...44f96bed2f56793b1c2583485aa161cdf30379.hip | 138 - ...93267865f1c2b0aa1a09a586f54cec98eea4ae.hip | 65 - ...d4901b8ef034590314048de7223a572d61ee0f.hip | 138 - ...ec21ed6e040260c4f04ef68ef9307aa86985a7.hip | 138 - ...1401abfbbbdf0dd1d62df8bc3e85371ead71d6.hip | 138 - ...3176ecb1f0bc800c870861585edf56f88d7739.hip | 80 - ...4ec604c577a27e0aae5b39711a9e2eb82801b6.hip | 138 - ...5705ae121a1a331527cedfe4d31218a428a0df.hip | 80 - ...8a3d76e8ab73af9a5d2302d33e3b1d1b866dd1.hip | 138 - ...97eca4d1a18306b406b367653622a8d64095bf.hip | 138 - ...ba59d347ce8916a22b40e6f22a3c89e13db4d0.hip | 138 - ...d5f2aef029f2103bb419cc982cae99fd1a9253.hip | 80 - ...24904ac5a2040c7ea72aef5942212f291a21bf.hip | 138 - ...8b211174da0f398b2a093e7389905b4f9c4060.hip | 138 - ...96c14b8fee751d03f42ca48ea4f66e87fc2e2f.hip | 80 - ...97ce4d2e5264bdeda47487d5bdb55a014c6616.hip | 138 - ...a310a6eb86e3e8baac7a930c3ffbef372942b3.hip | 138 - ...c38912947881caa14b3fc7ab7bca317e296dc3.hip | 138 - ...f2010bf6c478d2f0eba77e912697661306c1cb.hip | 73 - ...f21e38ad01fade35b1db40adabd75eb602410c.hip | 80 - ...01e6aea44b96e94fb019501be6b102c6e6a654.hip | 138 - ...1bde840c0c8149b24a8f6f264e963c4e9e8ceb.hip | 138 - ...5940baaaa2ae6ade43ef4c94a220eaa63702b0.hip | 138 - ...674fc182dfa6329c73a354aa3adf458429444a.hip | 138 - ...704ca28a4877a1e84022e022614709adabb280.hip | 138 - ...8c80fd3ea17813df1bf19a158186834fd00780.hip | 138 - ...be322fc072ca19baa82707e260c6eba936ae19.hip | 138 - ...f884e9ca116ee47b446efe9fc770c178a858d5.hip | 138 - ...0ad1eb1b30ad8f1e7c17df486093129b2d5630.hip | 80 - ...200e875e0ef160b311c7de450c137772312d0d.hip | 138 - ...2016803aa3ca6ebe785557118365f9be7c4339.hip | 80 - ...26be8909f631c04d4395fa4ffd03a736f447f1.hip | 138 - ...28d5bec7941c9b6d5632bee8d67ed92b9c03ec.hip | 138 - ...64814a0de7702f0b7b5ce9dede6440603f4853.hip | 138 - ...a814291d8f01870274149b9d82fb75921d6e20.hip | 138 - ...d0223697ed41c4c2fd8830f8df6e5620db547f.hip | 138 - 
...31ce329f2a0812ebb1dd103ea4ba8cb7ba531d.hip | 138 - ...38849e57ee9cd292e588f587a8079b57becfc8.hip | 138 - ...3ec08544591a22f59dc12f169b7327b4185a1a.hip | 138 - ...4c35fee4d372123631312f1051c43e1fa12378.hip | 138 - ...663faeb0425f45e8a0da0f7b1a5ddbee5e07e7.hip | 138 - ...72c45ba170f2782c4b5b75cfc78ac79a4cf157.hip | 138 - ...78e2a4d3b96a552e03d1ffc33debfd50c9f7f1.hip | 138 - ...e1edca5abe1bb3e7aa946eab6484b7bed806a3.hip | 138 - ...e945db4afa1330fe3978bc1bc9ae99828ae287.hip | 138 - ...f7e2a2c08cd87702793f91b6935cbe4c22be55.hip | 138 - ...7750ac0b18b48f56ceb4640256e9bd3a36621a.hip | 80 - ...93fc08ac5c6ce7a2eceb1227f4e3718dc4cf5f.hip | 138 - ...a7dce707954e765d97cb22e57d9bd6168860d9.hip | 138 - ...d0b8053ddf99a4d4447656d733c2da026b3a7c.hip | 138 - ...f182ae021e23869d7bebf2a9b4575bdc910ed0.hip | 80 - ...0ab620e6d62259a559e329460e46e6e3f7c3f9.hip | 138 - ...13d62a715fd717f0d4101f787349cb49cbe70f.hip | 138 - ...242e5953f44316b6a4f6587ec26283ed6cbcae.hip | 138 - ...2e032f6500fbc5468183415b6dd1d3e43f0bee.hip | 138 - ...890b126da2d8cfbf84f048b779cac2dd56b509.hip | 80 - ...902ed4ae3cc6558c73b730ff3949778007a230.hip | 80 - ...a14aa94d625b33df1adfa30ef4d91769592608.hip | 73 - ...b03a62e064864e1e9c1cd506c1b2e1786a777c.hip | 138 - ...df69b51f0a8cc9ae7e250e60df38758230fe4f.hip | 138 - ...fd1a756247b15b078d15a39e350a07c22982da.hip | 138 - ...2d3680c3578c7292349b58843aef7a82e0087d.hip | 80 - ...5680f97836be4a369802e8115617a83875703e.hip | 138 - ...67045d438a7e4b8f3a313a5df5a85f351c1be5.hip | 73 - ...7fa76609243a8709f349ffc0d9d88157f28dc9.hip | 73 - ...9a3bf1a9b37e0bd9bae6249609e5994dc0dba1.hip | 80 - ...b7b63e8a4c1df4eac4d978e166867195bd6e53.hip | 138 - ...19fc90e5a9c422dbf529d2def286f47dea0f50.hip | 138 - ...23dde1a386436e9864c8fa5f1706c0d2fbfd0d.hip | 138 - ...3d8ef4da515960bf40eb1feb04d21950ad5ae5.hip | 138 - ...4710e8f4e27fae4ae079f1667c3a1879cb6da8.hip | 138 - ...be4562c51d6829ec5942e11035c452fe318b3a.hip | 138 - ...dc419d4248dfdeeab1f0980aec35fa134e52e0.hip | 138 - ...08373ace7087bdaca4ce8b0bc329f553f88d77.hip | 138 - ...0f767c17385eb7d756cbe8ed444d7cef72dea5.hip | 65 - ...12e9cb599d24631c082e3cf65d2c58b6d4d44f.hip | 138 - ...2f87c021e0b6a27b2d7e30351fd50f06414b5f.hip | 138 - ...5667b27f15a06d4040354fba3601d48bb9c045.hip | 80 - ...ac5d4cf103d658e129673549549f1276f134e0.hip | 138 - ...d260849b86c46b685955cab54ba07d49b47954.hip | 138 - ...dd621da88c57798db1e689b93b692b6519ff96.hip | 138 - ...fe21ee27f8a0ca0407ef0dea73cd73ae6940db.hip | 138 - ...1bdde812c332c9fc58613698568a04771b9fa8.hip | 80 - ...332a6aeecfb12dcf70c69157fd3137343fb9f6.hip | 138 - ...6129eead18d13a4a6cb9550384fddabc7a2a16.hip | 138 - ...89f79217037e361bb0909d06534e40f5026b4f.hip | 138 - ...9519dd0d0f940fd5efd61bd32df7528ba7e3fc.hip | 138 - ...9c7feb747241c9c7de2adf3a19933a1c4c0995.hip | 138 - ...a9c37d92e344f3cc58cd4d1d00f19167e3623e.hip | 80 - ...c038393ec329a894aee9bbac078a40f57a4684.hip | 138 - ...c04763d635c5bc3e810737b5d948c59f117d5a.hip | 138 - ...e953cb24e28bcdc8f05783894b23cbf83bdf35.hip | 138 - ...6ccdb3c2d595fffd05bc5e6417b157276547fb.hip | 138 - ...80d44e82e601dc48d4c8b4e710ef7265894b6c.hip | 138 - ...9403cb91d6aabebf081afae94a8ba397d8d24f.hip | 73 - ...9bb3486fee7b7c9e24300b8a4e4ce88a11bfc0.hip | 80 - ...a76fc1b066a15b08dc6c24a7cf33a58b4cb6cb.hip | 80 - ...e409f4421193fb48a54aa5f26bd6229d23204c.hip | 80 - ...f65c7abd9b0d8a2df9302d6dc167637b3a72f0.hip | 80 - ...04763f674dfb3f14b66dfdeb2a046e413ce2cb.hip | 138 - ...07bf7ae1b71bf8ac4a793aa519ad333aa7a7ba.hip | 138 - ...21fa266c77e6b5bd1af2a9c22c686e5a6eac78.hip | 138 - 
...2b21f9588d72c3c3e3b9a3b269f19c484d5aa4.hip | 138 - ...46f566fa7188c92568b277354e8b06ad382544.hip | 138 - ...6f9ab9baf631df1d3a8d801e4cf93a102526cf.hip | 138 - ...7545400aa6e70ff49a5f38ed6a218a180bd87f.hip | 138 - ...987e2d765efc320eaee813607c94c80ee35aa4.hip | 138 - ...a72d70d80b66c19e85daa00497308381050048.hip | 138 - ...bfb0e6032892cc58cef4dd403f305a5b76851b.hip | 138 - ...cf0997573f4bcfbaaf75e40f519580a7495a17.hip | 80 - ...efc341089a50ed5669b3c86f6ddd9b124d1442.hip | 138 - ...f51f0e178c33e6196df1d2e47bd38bf5391cc8.hip | 80 - ...fb694fce7b4c3c459fca43c89c6002fbfdaef5.hip | 138 - ...0dd4e870ceda3ba9b5f0084a4b025b2e609d57.hip | 138 - ...1db756577b61cde9fe8279d956980db9ee21a4.hip | 73 - ...3e60e8405aca3f7fbed19452ae37574ada9a77.hip | 80 - ...5918206483d2ae04a45aa67d69dfb986587214.hip | 138 - ...6c48e129a0235cb3a19124ddb28cce286fb368.hip | 80 - ...acf1d17650712b71a499bb66909bfcfcb6aecb.hip | 138 - ...bb8f13b6f20a72c9ce6d0b53f81eddbf05f1c6.hip | 138 - ...dd3ea61bb61de02667b14f5a94198f48c7307b.hip | 73 - ...f6c575c3fa2ccc7e65022f1ba65c8cfc16541e.hip | 80 - ...048cf91270631f98ac37dc488a1fb2e00ce004.hip | 138 - ...50f27341241086515d833aa53ae873d4ece3fa.hip | 65 - ...78845045d68027dcf3bf867ecde2fb12ec51d3.hip | 138 - ...ad0c0580516485ea432d98f53e73f6dfec548c.hip | 138 - ...c932e6eaaf44861c794539d9caf8b50192fc44.hip | 138 - ...d7f61e6313930f063758b61102e7a43b118beb.hip | 80 - ...f0f3d71108dcc49234a258f0f3b21ea2123cc0.hip | 138 - ...f1d7e1a93bf2fa80c409e6827ea88af56c44f0.hip | 138 - ...01bfc0394936a68fa0098580f06e77c88ebed9.hip | 138 - ...080406598df6bd3102db70a554e496e29db96a.hip | 138 - ...0e3532f27b391585d5de90f3bdf97992b67651.hip | 80 - ...52031044ef2e4a22e27ad04ab5d2c02121faee.hip | 138 - ...5a906031a258c6362313eec783678bd8125c91.hip | 138 - ...6a308c2d2afd6e0dfbfda61984b631c4ccffc6.hip | 138 - ...d580a612af85533c87aecdd7b0345c71b75980.hip | 138 - ...d920a76114c63156740ba5dd6f3846c4b21c28.hip | 138 - ...ddca2c6ecbba4314c434e7471ffb8fa642f936.hip | 138 - ...f6a1837a65df12b7c55d25ca28cc939c2a6328.hip | 138 - ...3e7888cba5f463d19fcb71aaaab25dc3d2c09d.hip | 138 - ...41910c34830ad2459fb85c2c14af02da718fdc.hip | 138 - ...57ea5726149efb8778e6d90798b8e48288fc9a.hip | 138 - ...7feaf237911478173377a501ee19ee325b012b.hip | 138 - ...cca7528c7d1bf49ba79625733ff0ae7522c096.hip | 138 - ...dc4af43de08130a04bfa06df9799b6e9e96900.hip | 138 - ...e8ae99e184013739019c93d07caddce532382b.hip | 138 - ...fc5e94f89d6a9287cf64662a372784511468dd.hip | 80 - ...13d96a66a4d9fb8dfc84afba7e1d8c200248a6.hip | 138 - ...156f2c556c6ef6180608c361b7b35ede71ffea.hip | 80 - ...4c8003a508ed3f8cbe6967c4ae2635a491c721.hip | 80 - ...908fe6dc9c629c82d6953081b10021e64583b1.hip | 138 - ...960fe542635079de5eca3c7785890cd4740005.hip | 138 - ...fdde4b25e2fc8cbdd46c2850c19eac8d9af8f6.hip | 138 - ...309c036d96367939ccc3e8922595ac35a3e179.hip | 138 - ...513d6e065a44bcb0c789eed1e7e5456e800ab6.hip | 138 - ...5eb90b1a2d64acc0f6fbe1d807c501fd4be3cd.hip | 138 - ...89126a7eb09d81baaf8f99dbff8932fbeab3cb.hip | 138 - ...d73393d0d8b769f30222f7817563a955c36dfc.hip | 138 - ...fa51b8c7a2f3fac5cf4cd2951ed2ede5c35450.hip | 138 - ...5b08ca602fe48840c72cd61798acb98540fcd6.hip | 138 - ...6a418fbe6183d0392b7a7d9986d067e323e2b9.hip | 138 - ...7e33463b3bf1853c6d2d2009af8d27bf88abbe.hip | 138 - ...93dc3217e154b65ebba065aa10ab4dc2374ae8.hip | 65 - ...e3a06266deda093bdf28af82d8666066157fc6.hip | 138 - ...40e8899b4e632714632450bcef001c6070f955.hip | 80 - ...ac7f6cbdfca2e397bcb86af4216e87166601c7.hip | 138 - ...c04463f9c5ce565a9daa8c22e16de80fadd707.hip | 80 - 
...d52c5f70abb525b9c8aa8fc1cb3997c33ed67c.hip | 138 - ...ea5b5346c87cc4fc1e841c518080df4ab811a2.hip | 73 - ...ed7f650c958a644c8031aeb88688b1e42458e5.hip | 138 - ...0aa875ac13957f00b30210477924697abf0c9e.hip | 80 - ...617bdea526d12d6a33ed42b9b0018c0b173722.hip | 138 - ...a3327da9a3411ff1cddc67eb647083cd947a92.hip | 138 - ...1fd28acfe85b3adac859c4bbffa4d28fe634fe.hip | 138 - ...58d4bca33c4c0e79141a56688049237d170d1b.hip | 138 - ...824621a50cdc3cbadc4b1f9ef18e1325385082.hip | 138 - ...980749c6b2a18c80426dd189e5506334343ca4.hip | 80 - ...dbdcd28cb2f078f89adf9aad2b3d4a0a477823.hip | 138 - ...17c082f249649eca733a8f0cdf9a1205c3e3d7.hip | 138 - ...9043572cabb65435627a3faf23b18d039bbcd8.hip | 138 - ...92990df507e82f96eeb7aa3ec00c01437566fb.hip | 138 - ...d1a40b12ce927323594fcce61eb9c20cc5e3d4.hip | 73 - ...d7b8c63a51c8639b3cf27ad09d41ae47c480d3.hip | 138 - ...074afcf33e3f3534ac3577484237fcfd2ca48e.hip | 138 - ...13c4f3f645a2bb475eb1c55ce1de452f0e2332.hip | 73 - ...3bd4e029bba76ebfc79e6522dbc8ca0bba5dd2.hip | 138 - ...4688cbd23727dd0ea9a36fb977b31aeae98d65.hip | 138 - ...7970957024de050748d3e31cef434f582d968b.hip | 138 - ...dcdeb845e7bcdb89ef70ab2a97157d4db3cb52.hip | 138 - ...f1007430da272174d3476d042f398627e83512.hip | 80 - ...079c1eb36db8461fa8b861c56760afcd97cc34.hip | 138 - ...7549e66ef309e32779ddc2a1f14e79bae53754.hip | 138 - ...79fe8a600c3b4e0ec9aa510f8036ba2b608985.hip | 138 - ...a8285bd6182355e3164cdc5a983375cdf0a61d.hip | 138 - ...1b48a28b71c7f4c78eb14321b39951a7c5e903.hip | 138 - ...2c587db8bd9f1b551624e0cf8b67a90245d7da.hip | 73 - ...2d5f979fc4fbd0991581a020a414f9c8656ae2.hip | 138 - ...431313fe082958d31b68d2fd0d61df0fe56736.hip | 138 - ...50ea8dd480012cbe10be392cd26d1870e6ef9b.hip | 80 - ...675919a6c7758cbbeecb83b7ac6c62f95cdb46.hip | 65 - ...812705ae3e452810794fa7caceef2ef6066dfb.hip | 138 - ...816fcad5e9ecfca94a6491eb2274bcc41e558b.hip | 138 - ...938d0e3ad30db201880642e57758285b2ec4cb.hip | 65 - ...fb5fc2ace6839eac741c5e6616665845f43566.hip | 138 - ...607ee20c0d92b6dbd0338f139517fdcce98d0c.hip | 138 - ...6e463eedd3e65b9c79feed3cd92ad8cbc9f036.hip | 138 - ...7166d4bb0c1c9b9999ba16a1adbf09ebfdb6f1.hip | 80 - ...a4c40e244b412a07933d369704bcdaa6d5e74c.hip | 80 - ...b224b40a7be7db0a9c5c08cc5ab05b526c14e8.hip | 138 - ...b33fc20f2e85e915f1b1529ae87981dfcaf86d.hip | 138 - ...c08b4f3959a2375ac03f40c4ce12d70cdc2d80.hip | 138 - ...09b7d39346537aa6c4a4e46b81139f603edb60.hip | 138 - ...0d7f81c73b35ea64095d01c5d48d9190839e0a.hip | 80 - ...68ba8df8b0e977e9769f6acf6cfee6b00b9922.hip | 138 - ...6fa8bf5e992ddc25815486ae9c24d8bfba7227.hip | 138 - ...b17d8cba28cceddb3ef907df878aeef0762d15.hip | 138 - ...da0d469cca5c8481504148468460c85a15c559.hip | 138 - ...e5c56e92712d00092ba102a5eb5176a3e5d471.hip | 138 - ...0cb8bd09d287a1566265eb1e8894fe68d3cc81.hip | 138 - ...5b75db795dbef037b14b003ee073665fe35d3e.hip | 138 - ...63ae070075f26926a86d39e15c27e6edb1f1cf.hip | 138 - ...695dea4171747fb3cc6d910459f800608d07c1.hip | 138 - ...9ae177b7a793fa352c4f6bb8e4175f3064d814.hip | 138 - ...a6200e36944b1f11106c02f7fcee053f01ee71.hip | 138 - ...b9e2616c2fe0480096b1ccf0f74d584b220146.hip | 73 - ...c916e14198f6d18dc89915e379b01070434e91.hip | 138 - ...07a63fc55c411c73e4f93306c5ffed800dd249.hip | 80 - ...121fd448b4640a17e1a7fe73bb7b58714c0afb.hip | 138 - ...1f789d619db6f225e8e9d646e93bbc9dc1a669.hip | 138 - ...739f4464512feee083b875e11e11eee4f5b448.hip | 80 - ...992be6252f2afdc368bd4baec4b8a55ae0abf8.hip | 80 - ...b0770fe64e3c60b9e56170aa88bbf74802a813.hip | 138 - ...b722cdabcfaa388ccc6ccceb7e42462f3bdcd1.hip | 80 - 
...ba64cdf615c1be2865f027a293cb530fc07dc6.hip | 138 - ...d841e6d783bb46d841aafd9027f92dd1b61b88.hip | 138 - ...e53359c69bbe4d7405d45261a8a62008eb7d06.hip | 138 - ...f9ad0fb65638cfffb3e7786f2cbf01d9585b23.hip | 138 - ...054acb8a9508fd0f0f486367fb62454de47c39.hip | 80 - ...1cf8d05cfa45319f4e5bb49334d35a530bffcf.hip | 138 - ...728d999ae43ee1b5a16e60b90cf8533c7d303f.hip | 80 - ...7801fbb43fb6797f0425f08d13926b74d87c4a.hip | 65 - ...7c48d0b7096ad6c8bc445f13f2c8c1934695ab.hip | 80 - ...b885d6869400b0dc2ef1b2c2636ddfd21cde31.hip | 65 - ...2439e4f5644a3a4630481bc7d98834b29b6e1c.hip | 80 - ...a94d145e575747c8956ac703810582c819e2e8.hip | 138 - ...aa519eb57e5797125728492d9330f5c0f0670a.hip | 138 - ...f6f9dee9f0c3825d91f4d320a5280070e60ee7.hip | 138 - ...061acc6650fc7b79fa1fe5b2b1e083555eec2c.hip | 80 - ...1343832a5bfd060c8d12da0d8a090f070a717d.hip | 80 - ...45f95c1093c60f0fb6c794636f79aaeb53b733.hip | 73 - ...530399ad7b43d8ce2c89da24c71056f2146b18.hip | 138 - ...83148fd684a7e6a312127e023798278415bd27.hip | 138 - ...94816877815bc0294610ca24f986fdccdc7c6f.hip | 138 - ...0ecb3013071fb65f2d5ed4c947c4bf303e5308.hip | 138 - ...38c9618dbf2af119e37596f7eb0fd3f8d72748.hip | 138 - ...3986150adcd6e1d3886bacf2166de1252e14df.hip | 73 - ...4f916d3484295b5918e2e4c22c5529588a5662.hip | 138 - ...89ecd7bf51bcffe9f5002959bdda41c50a3c8b.hip | 73 - ...8fc75a7d102aca068e3ceb6111728c280fa837.hip | 138 - ...c129dd4c798343d6f78ab78056f0faf2f1c9d3.hip | 138 - ...c5e79f54b71677124f555b0ae4bfd27248d099.hip | 138 - ...caa2056d99eb67ada498e287b4fae984397691.hip | 138 - ...dee49ec6755006d67f0c30c65f50558bba69b0.hip | 138 - ...f1bb85dff8c97846f6b2e8796a6289bcd0d9d3.hip | 138 - ...0073c70133ff2ee4737f803a0ac43801c47242.hip | 138 - ...1a08c2e48d805b295d979b24173a04cf58def0.hip | 80 - ...246460c21bc66c0f13936d27477a9fca1c44d1.hip | 138 - ...45b04a8026a01828c5dd606d89d044d3ed1d99.hip | 73 - ...6cf509d9c2bf86ba6ee5ded544fa8e6717f590.hip | 80 - ...7137b371df841993c8d0584be7d83aca6add78.hip | 138 - ...851d5ecbf02f8af623988b1a39c0b91e51533a.hip | 138 - ...01b25e0f132d647934deb395b62a3f70cc7c88.hip | 138 - ...7a617fae00fa90a1ba60937b0312c81087c19e.hip | 138 - ...7f00dd759d9714693e7517dfaa8bb427294d42.hip | 138 - ...93336a4b00b2a63f23ed7e13ec54c82d9e5063.hip | 73 - ...e484adeddf3394d8d7693b808d83b64c71ee69.hip | 138 - ...f5efcd500ce6b9ffc14bc9877e0ba457539925.hip | 80 - ...f9a4f4d85f292b78123599a2e1798f12aa545b.hip | 80 - ...90e6ad243a48b84304b5cad0c663c0802aedfd.hip | 138 - ...ae680eed89ea93a3a94586bd5a68dbc5439f37.hip | 80 - ...e2f290b962f1617b0a9d4fd6d55c43e4439d6f.hip | 138 - ...f8352674bd6bbe98944a1c0a769a4fc028a623.hip | 138 - ...0a70932bd587759df1e5e150b25b0126d7b529.hip | 80 - ...20fa19d8d30654602e363806f559113218d66d.hip | 138 - ...8e04fe9432a60f86ff0369e8c1851821074a04.hip | 138 - ...9edbe35a8fac7796f00bde836bd547044770ea.hip | 138 - ...b73ea77ec20ea3bfaf995dacf93a6960ecdca0.hip | 138 - ...d1f99284aafc8d7908d062f179a056eb314925.hip | 138 - ...e866c7db36286876818bfb718ac35204fa3843.hip | 138 - ...fe4b6f3b901ff4af81bd4f1cd8ff19f09d0b07.hip | 138 - ...062dd633645772e4f2caffd111af73184f7657.hip | 138 - ...327f0fa1155f2235d76be45cd22e3db5a69429.hip | 138 - ...4dcde1ae3446b825dea739d4295c1d1ec5c4be.hip | 138 - ...6d08e63b9a90f2524cbfa8c5fcf8b82a1d2d36.hip | 80 - ...73c92a13757877f34bd8a13c6fb29b60999020.hip | 138 - ...841b7cf5da31f0c30ec42c91cc8d5bd3fedd03.hip | 138 - ...cc791049e3ff9ebc1a9085d2d20efcc2f99b71.hip | 138 - ...f235679af1ca03a6e601b4cf6cd0416d1c9091.hip | 138 - ...4fc7cda4b560040cec93f63021b529aa1ee3fd.hip | 138 - 
...a3b1d36d777213eb381b47871bf15dd163c994.hip | 138 - ...c3ef3d3b36f52089548e9dce522b0448e2c26a.hip | 138 - ...3d274058bc0a3d4d35d90669587761fdfbdba1.hip | 80 - ...6759d8855c4c6289f1f241a1628cf0406c1b64.hip | 80 - ...69d441f48f9ea346dd8e00376a9a708da3ad87.hip | 80 - ...c424f0e192155e3c4e786e5b87d5a1a3e6c4ad.hip | 138 - ...51083e13aa4dfa8c969f8f916835a8e5e9ca39.hip | 80 - ...ef1b54d5d3841f3fa6b84cca6c7ad33efa2d9f.hip | 138 - ...0517550c7a23882b95de451e8099ea2186b4ce.hip | 80 - ...b389d4b5ba590baa951f17da06f0e53d2bfa55.hip | 80 - ...17be7b8bcf303b30a147f41346898acc5fab7d.hip | 138 - ...2a71fdd587e47ee68e0cc76c3c4494ce06c359.hip | 138 - ...2f152e9184af0b3d77082d8bdf519dbbfceb2d.hip | 138 - ...46e888e3836b0bd3c49fec8e1872e880798f0c.hip | 138 - ...874fc5ac87a1ec487c7722bf3b1bdaa924ee09.hip | 138 - ...94599fb5caf5e7aba728cd4713a8d0c6368a46.hip | 138 - ...a556c9358ddd6db719458c81d2d6d822a895da.hip | 80 - ...03cd47156a98ad2cf2c325ea00df3f1d67fb72.hip | 138 - ...89292c81a18d21a2921ce6740f81ebf4c046ad.hip | 138 - ...c71e7d33f0597fe090a3524e33e18b2e562680.hip | 138 - ...cba1509c413c870c5d784410855ee1bd737da2.hip | 80 - ...d6ad9de7ac7993ae1923a2ef070b7dacb8c563.hip | 138 - ...0c91b2f11bb7e5058ca7935b0bda4f5558a9dc.hip | 138 - ...1f3637624762547af1292e1b85e640b1d329dc.hip | 138 - ...25c4f1f3c7b271957768bb9235131c67afb48a.hip | 138 - ...482a64659c838f3da55f56e3cbbee1dbfe6722.hip | 80 - ...5e2aed617e1ff31f93ae7e054313ee0dceee97.hip | 138 - ...a715b7e9c1a576f011dfe5769c5b392e984f82.hip | 138 - ...ef5d30a2318ae06430d17f84878800c4ca7364.hip | 138 - ...339150d8bf9d073827738527f6cbe15b854607.hip | 138 - ...709e4fc53d2254a03ea7660b8c72d2f47cf1ad.hip | 138 - ...88a284f45f711d82a6ed87036d87cef1872eb1.hip | 138 - ...ac4f93722dc314086f1b7d7b8adc687cd75f82.hip | 73 - ...d7aa46528ee74e2bef1e87c1feceacfa55e173.hip | 138 - ...dc780b17152f696f9b957432c2eae8fb16e85e.hip | 138 - ...f9c236d24b30bc9c3fad90cfd6eb00da835de2.hip | 138 - ...ff8445ba691807caadd9f26e7eb90851875280.hip | 138 - ...21c2ed6b295c458071f1988b9d6f7b46e8992c.hip | 138 - ...700d87a19a173e84d64e43cffabbed52366e35.hip | 138 - ...87f617c4b84c6a0328fedac750d41dc3dafe27.hip | 138 - ...8843d844f78690c7a45b730652f0f763c595c7.hip | 138 - ...980becb0d3149fee575bad1fc3b463d08aabf5.hip | 138 - ...b7f10440331a8a88ff93ba253217c2832bcf9e.hip | 138 - ...5b47aafc4340e69e300ac61a7601a5c14513b7.hip | 80 - ...5c7dd576e5b1061c059e5e99aeedf4389e2d25.hip | 138 - ...9423c095db052603d77073d409534bceef425f.hip | 80 - ...a7833f4597bb03a3e845d5580d677e97421040.hip | 80 - ...bdc110955c05c6c6ea236a6f60266a4a6dce5e.hip | 138 - ...c0109313de1f6245d2a80f8539485b849e9d55.hip | 138 - ...c4dc0d70c547dbbfb661e879ba7f9adfafc2ea.hip | 80 - ...d4eb673bafd81e3a0ee213da4603d88b8460ec.hip | 138 - ...e5cae764142683b70d3344cf07dd1edb7d69e2.hip | 138 - ...f2f0cef657ae5e333d65ae4ab20529a43cd7de.hip | 138 - ...f8b7b2a891aa9f2ab49762eb31d835efdf18b6.hip | 80 - ...fa94bb32a80e81886b711ebfcf2df5f5405866.hip | 138 - ...22fa57764ec746e02f6d4bd4846b48c722b807.hip | 138 - ...2a2ab489839ea1a1bfd1b24e54a3c232ed934f.hip | 138 - ...461d72fb6ba50e81de3f661528c96dcfdc3f3c.hip | 80 - ...4b4cf3f6706e4b4e0af4402e2263b9a1585f9b.hip | 80 - ...5c43b870705c780d734f9ef063f55cf8b3b52d.hip | 138 - ...73f35edd69241c6b921d6712dfd064d78ecbad.hip | 138 - ...1305f191f06cd53b7563971c706e8b71b19e2f.hip | 138 - ...4b0e7dd816ad08eec5a1bba6e227afee9813ec.hip | 80 - ...784b03ad757d51c234fa86ea9891f055ecd5c1.hip | 73 - ...8fecb9725ceb4bcf2aa037d43bc43efeb1c3fd.hip | 138 - ...f7553a7d2f6d42fe695cdc64423c85223af440.hip | 138 - 
...21661d8280c6e9d27f2c9ce1b3c855387b5a76.hip | 138 - ...5d35b2fd98742427930eb536e346ffb005edd8.hip | 138 - ...a4af070ee46d802cb11086b93daf91538f8a04.hip | 80 - ...a744edfa3a19d1493611df5bd0d4d59b707d43.hip | 80 - ...2b43d374642df991edef1f6036dc898bf77cf8.hip | 138 - ...3324ccf11b273ed20fd960c61df897c8890b1d.hip | 138 - ...3a03b33305b33055273711ab31a5b8d8298d5d.hip | 138 - ...68df29f5ae1463706b7981b3bde55918e1aa65.hip | 80 - ...8925d99dc484da41dd55700e151cf545cf821d.hip | 80 - ...b50c6ebb27986ce5b378d8c39315eb9cb91dea.hip | 138 - ...d2be18e2d53a5144f97dfdebb225fcb6d611d3.hip | 80 - ...df9ac4ee78e5f4d5bd0567e58a7090907c61e1.hip | 138 - ...f00f270680de81df7737e848e0408cb070e68b.hip | 65 - ...1041530f794c7b8dc4a8321ea0fcdd338fff35.hip | 138 - ...522b43c5e5ea69bcabb4c0fe28def2bd081a12.hip | 80 - ...6d13b09f85ee62bb5018608812181fb43afc86.hip | 80 - ...82d20635e592edbf00439294835f6f39ad54a3.hip | 80 - ...996b9c843200a2ec33ed4319b48106cd7c6384.hip | 138 - ...fe891dad43815e635f81225705ff944f990d75.hip | 138 - ...09941bddfa9d61985b55f9b6bf0edec9bb89f6.hip | 138 - ...0be5a2072b5e87f5ee58149688796b6513219f.hip | 138 - ...0c3fe9529e24327686070731d0ac3ada76245e.hip | 138 - ...1ca4ce061f7f69a250356f613cab00d1e2ac71.hip | 138 - ...1d7f93427095e39bfc1d986b3d7fe54073ec75.hip | 138 - ...43f4a56c166dad0113f51b337a083f4df7cdb6.hip | 80 - ...56e886d53a1d88fada0f10f00b9f398dc54568.hip | 138 - ...6cd5c9242f8278c8f3d9ce57b97d605c7e5a3e.hip | 73 - ...877ae2a1aab04498bf2b26b3fe99d6488ef151.hip | 138 - ...f6c6412f9853855b74a96e862935ddef66f763.hip | 138 - ...f92a5314fd33491b5eb6ebd2418b7e0d5db774.hip | 138 - ...1ccde31b47e0e56ee0daab6403fed7895208c7.hip | 80 - ...5e9aee85cd16903bf7b82a4ac10402b0b26e22.hip | 138 - ...9382cf8bb56ffd962c99329bf67da992f8810d.hip | 138 - ...eb0641213e9a45ba48bcf72bb23845720d8b79.hip | 138 - ...091c69d19b27f7ad50ef6311532ad8b642a9c6.hip | 138 - ...82071cc074fd30437f6158b5eb2c6df1f8c587.hip | 138 - ...989d2ce769f20e175fa88f4082c1c25fe03062.hip | 138 - ...9b99a194b59d3149842c15733394da275b12c0.hip | 138 - ...a016be2bd0e377fbe01fa7adb9bbb8febce100.hip | 138 - ...ae2d4f8b2dac799e03ea6f279e6ecdf66f5381.hip | 138 - ...aef10ff2c5d89530310bdf1d53a194f06a94ef.hip | 138 - ...d29e3e9828911a117dccaa5650e77805730d14.hip | 138 - ...da7ad787524e3e47dcc1b65c41b2faea38f55f.hip | 138 - ...db6a14043c5a4df0f5042b3770b40c4e90795c.hip | 138 - ...f160741a4f751d2f15d6eb23d4121cdca62b55.hip | 80 - ...1ab1f4bbe86bb9bbc22e4774648076c321136f.hip | 80 - ...1afeb6cfdf860ff08e4c2f11c922fd5bfa621a.hip | 80 - ...239476d61f48379754b97f29d7a285cc3192de.hip | 138 - ...4e7253ad4873576052ec0a9400597bb7975753.hip | 138 - ...4e80cb185759dd9b3eb3c67c239964b3694caa.hip | 138 - ...51b30c7e1cd30e550187458350c8db7c59a9ef.hip | 138 - ...7899b1ef159ecbf01f27014601eb79b31b49b3.hip | 138 - ...87b1d5c50606430b544ed650d87df24366e7d5.hip | 138 - ...8d0bdde763e617beafc0365ec4a3cd11df6c55.hip | 138 - ...bb2441e6cc1ccba4a391566e547402bcf7ced2.hip | 138 - ...bd5fed34ebceb879ae3dffaf58c7c04ab5fe80.hip | 138 - ...bff7e6605b273bad844b8f70ef031625bff48e.hip | 138 - ...c87e65afa93e84d7a947c52f291c1c7360033c.hip | 138 - ...ce14f7a220222eb4ce6783ec2b9fce6fde94b8.hip | 80 - ...06c0dae15684f83e15722a4c07342af9ea011c.hip | 138 - ...6ccfa11add1ae49888337e84d9c446d2f67da4.hip | 138 - ...adc4f76e237514db0bc0203102297b79730bd0.hip | 138 - ...c4b47a6fa62a4ca5cff6a7e01c9f6b371d2215.hip | 73 - ...cafd07c1f56e74373ccf37db35976023456d50.hip | 80 - ...ccf699f593c828e11efc053b144044e45b32d6.hip | 138 - ...da8f46b5ded4c2aa9d722fec17b75004b59f7d.hip | 138 - 
...dab954fd111ec48721f25710d61c0c8affd8db.hip | 138 - ...0e062055933388e37525df5766f3c14cd3538a.hip | 138 - ...1dc872c24db4db0c9179fc07e17f41060390de.hip | 138 - ...3ab68e33844f97aa58d463e00037bc11c50da0.hip | 138 - ...4f14f829eff73afaa57a875f74ebd1e6860979.hip | 138 - ...544a38dfdf4d81dc95894387845f48435e299a.hip | 138 - ...dd965d5d9080ed5c6a04b7eea9890f3a264f20.hip | 138 - ...f555b74ed36f1bef8f47880b3edc6760f27788.hip | 138 - ...766695dbb790bd614b83dc7569ad449404cc89.hip | 138 - ...8a615e66d7cd739ce35412811359a03cb23a8e.hip | 138 - ...92c55f002d8540d5f965cc4df0c2e33f4b9ff9.hip | 65 - ...9f05f6848403480ba41d37cdbf44ccca1b1f8d.hip | 138 - ...ad101ce91348266d3885afdf2996a0fdb72135.hip | 80 - ...c5d55d47d6038e9162d32ac968ff58c0942938.hip | 138 - ...0c6252863a73341b0010191fad4c834860f884.hip | 138 - ...0e314642cf565e4f32bceffdb5c0e653ab627b.hip | 80 - ...4f91dec2029b25d0d96962528410df55a468ed.hip | 138 - ...85e2f1970b78e18002464eeda63798229bbc3a.hip | 80 - ...98e213f927b518c693660110f08bdd94990ef0.hip | 138 - ...af5f5b5ee3ae964824a3e9c7bbeb5bb39c557c.hip | 138 - ...f91e937b427ecc932c0cb0c90b2c2378db0be6.hip | 138 - ...063d06723ac70c5f8802ab49c5c35e1debf56e.hip | 138 - ...1f56244076c501cb09b4b90975132cae4c4386.hip | 138 - ...486244e0b7d6dbcaa1951e8b8883ce441c3f99.hip | 138 - ...4c1ce348c3d9cdf6bbec9758de9d5fe94c43fc.hip | 138 - ...8a1d3cffae01332a3a9d9472ff1b2c443e82af.hip | 80 - ...a104733f678193068d8642d6560faa03897258.hip | 80 - ...da22d3482738a8474ae15e8e5fca9020c4e195.hip | 138 - ...1735d250b5a16967281a5f07873b9cde3df4d6.hip | 138 - ...1a30092e8138877c1f6c25656e0f8ae2c2444e.hip | 138 - ...1ea5293bc1c56efa2c4b5681d965aa6f2ce6c3.hip | 80 - ...588379eaa268d79fe8f8e4457b009f204a5fb7.hip | 65 - ...93c99888d82cd2852bfb101f99a2e6a27665b8.hip | 80 - ...a5715b550f67b8870ba66e1e6282a26cc1dbf3.hip | 80 - ...b037a2e262d11d3ed7d9feeb41b9e05427a739.hip | 138 - ...bd2d206ceb237ed2c51f58abb5cbf96e39d07b.hip | 138 - ...ec377c44ac18527ca6a01bc3b146706a6e1e09.hip | 138 - ...f12f10d7b968e0d8e7c23f36d3a360de74a905.hip | 80 - ...0e6df20a2426abd3d2ff2262a37c009196024c.hip | 138 - ...13834918d5ea789e2db21abece7c2d3532a7e7.hip | 138 - ...248f443a12d96815c04409a00102923c717023.hip | 80 - ...371415448fffffd58bf014dac9f4876153657b.hip | 138 - ...ac596c636df55e81293228cbc53dcbb3024e5a.hip | 138 - ...ba2e73df35f6e0f7317303823fde92a42b1a35.hip | 138 - ...bccc85f74f54a2ceb17fe3040b04fe306c53f9.hip | 138 - ...c3131fb8e5a25bd4a14bc9075eb6fa01b61d02.hip | 80 - ...c7fca1f76a31b0390e92d90d569fab94d4f783.hip | 138 - ...db3d5b1d8af89381fc4b8073f84c5fa25fdef5.hip | 73 - ...0a4e87a7aabfe3c1ce02b408522f3ec862e3d7.hip | 138 - ...b17ae67adee9e56a022cd2a5514fb9c4e99920.hip | 138 - ...2a804bb3c99830653d41ac0bd49943c801b89a.hip | 138 - ...37410b404a51043fc3bd503c0b107c297e4c9f.hip | 80 - ...5843bb13058ffe29251e053800c509c7590544.hip | 138 - ...74450ebadaacf23e944aaf8ca90eada01e8a5a.hip | 80 - ...79cc0b0380e1e6a2b51fc6216fdd72215b882b.hip | 138 - ...a03ab0b7887cc7ed0cb40e56360a8d36c0bb8e.hip | 138 - ...0d0828ba6d24ea3c1a97bd9835ee937b4b32fb.hip | 138 - ...72f9e6ebe330cc1818ea82b53acec79a2f672c.hip | 138 - ...fbc6f6e9c515edce3c7a438b3bc308b30d3857.hip | 73 - ...385db12001110c42eff6aabad935a69ad3afe2.hip | 138 - ...559dd36a0a4f5e068a722e285f485137bd5ef0.hip | 138 - ...627f9c8d0088df0364a64643f2b5dcd951f2bb.hip | 80 - ...a742ceeb6736a2c8f9439d0b05e10d3e0c5c6f.hip | 138 - ...baf70220079e6d4e87eb01a7259923d8a01e29.hip | 138 - ...d00ab8373747a5c6b9d2f8dd50ceb14db4163c.hip | 138 - ...ed0a64deb55616646ea98b21a891c971cd98ad.hip | 138 - 
...145535e53899fe127987aa854f81234a9c51c4.hip | 138 - ...8b09f0aaa40a7c9ad5f0458b460d3e328f3c74.hip | 138 - ...fbef3f13d429ec3e9f4672218998d5669d79f2.hip | 80 - ...111b7acc269f8d5e70915d3efde4c425aa5f5c.hip | 80 - ...28a4e95723e3df380f98b5ac107c4df353850b.hip | 80 - ...35c86443cc9ea38c06ebc0656306483c95ef67.hip | 138 - ...a10ecb79ede07324e1198a71a95ff26e9eb235.hip | 138 - ...e23201fbebed25781f249e5c77c31e0e7f9ddb.hip | 80 - ...fd025488e52b97c04995c4c5faff371b77e4d6.hip | 138 - ...1ae1dddb8cc5d78196da6b26ebe66c1ce7e567.hip | 138 - ...238fd2095b26a167b41cdec8280182330b7b25.hip | 80 - ...4425e30a0b17e8b31726817e8d3177b5c51934.hip | 80 - ...4e0f0496a34d2fb43c80ce0162ad4183f29064.hip | 80 - ...6ce17223d8d83a64b8c96ac88223e4441a4692.hip | 138 - ...744db85d4237ee9640f1658e0caab7648e3bb6.hip | 138 - ...79e255d25744725e2a9db9f90d5cc2b8a0e0c1.hip | 138 - ...897852a4ca992961843144f4ec4f8b86dd5e9d.hip | 80 - ...b6f0730fd09b4c6c60913425927dfdb8f83d82.hip | 138 - ...d7ccdceb7baf3b986f2a0248827822a5f72e47.hip | 138 - ...f8836c8cf932cc2748e313885003f0e11a887f.hip | 138 - ...064e302ff5b983dbdb4ccf51383fb29ddff44f.hip | 138 - ...28203f47b6a48e9b66302cf8312f3796ca500c.hip | 138 - ...37f4f7914805a97d5073f1ebf8a8b8c2648d31.hip | 138 - ...3daa5f99b4522d932334924347353ce2854821.hip | 138 - ...6aa39d0ae3c87d011610cdb5e2e317f337c454.hip | 80 - ...80a1774d8b7d8bee4e8663392b97cda11dcbf5.hip | 138 - ...8bf7c572c1984ca3061062cf3c31d993f6762d.hip | 138 - ...9c47f3305e47db6ab6bc627fb3d80269633074.hip | 138 - ...ab172627718278a71a93e3737ef08ad9259a4f.hip | 138 - ...e24a8dbe6add6f2dd2beb48b1280f3a84a9b2a.hip | 138 - ...1e1533fc37b41838bd37edc2b6d2f2e76ae1c6.hip | 138 - ...4dd90ccb2f258029d0156cf23f940b694cf08d.hip | 138 - ...8ec1163a01b9cd9a802d8b44669e8770c20234.hip | 138 - ...ae876d6da465687f162136231f15767cc7bb14.hip | 80 - ...b9afccc15de7dfcb2e7d898abc0d61201de73e.hip | 138 - ...c30e7107c5dce3fe6aa87d83ed96da75478da0.hip | 73 - ...c9e4c0317e8d351f60258ed6611fbf365c4024.hip | 138 - ...cc2a4d7ac045365300bf8bd45fc6d3e1e1c8b1.hip | 138 - ...d5a8c5cf683f6dfaefad72c2e2f5c2f2b2732f.hip | 138 - ...f3bd014a918feddadc98eed92a7734f9bcd890.hip | 80 - ...9cdf86a7944cd690b0fcbbaec235863acd10bb.hip | 138 - ...338fbc05f86270ded7df2bd3e2758a03961b62.hip | 138 - ...342686e4efd26413c6719782ed13603479c4e0.hip | 138 - ...63318cb851ccaa923be12d34c84d839bc64bb8.hip | 80 - ...8095341ca7e3a1debeb780c1878e351692bee2.hip | 138 - ...a3c4ac0a50bb9b7ad764929dbee98c856b1210.hip | 138 - ...f76aff077c28f8afd7b22f284cf2894e08a043.hip | 73 - ...12c01d201c366bdd7acccf2e1b18b00f671153.hip | 138 - ...1d68fe766fc753c657362673704005b538660b.hip | 138 - ...37c03bf161b2ec6a9a046fa49d7bbf80ae47b8.hip | 138 - ...97d1f050f42d82e6851fa286db6f81ba197f40.hip | 138 - ...b76bc7a17f573c0d52c07ae9ff4302662ae61f.hip | 138 - ...b94e19d762ddc33cc4e94c6675d93cbde21e3d.hip | 80 - ...f40c3421b9ad8cf43940530ec50bcf620058f2.hip | 138 - ...f721a330b2d0fac13b22061616d7b10c0f91e9.hip | 138 - ...50ea59ab6e1ee39cce15cbd3f181047cdee31a.hip | 80 - ...541b6b5cf27de3f45f60671d36602f07ce1783.hip | 138 - ...7b3026f1dc3056dee3a3e64bf31c45683607c9.hip | 138 - ...8de8f96c8315877031a2d56261e95fee6aef44.hip | 80 - ...9110dd501853e87ebc122dd1971b0bb1bcd92f.hip | 138 - ...940fd05efd52bdf8a3f9aa4b78bde9b5809b34.hip | 138 - ...a2856bf9a81544a30d535a13554e3a8107c476.hip | 138 - ...b719893a4d8a1e71857966d399f06c0a41749c.hip | 80 - ...f04447e6a94c94a2315454e71d7d607a9fd0f8.hip | 138 - ...fcced07cc194a8050bc7b2f791453b3f5b2064.hip | 80 - ...23a4d1f24d59bddd20ed2f2fb6446627b0ae8b.hip | 138 - 
...55189ade9b1a8269230232db754a3881b53168.hip | 80 - ...5ea54eb6cd0f3756c462c66d9be956279b46ad.hip | 138 - ...63ee1b087f6b504a3dd3972b96e77db02b0582.hip | 138 - ...cfaf0d53869c373f6d0ec821b008dbb819141a.hip | 138 - ...d0eaf9399c863d672e8c08d123739bab837d4b.hip | 80 - ...015f0d0a7a5173810f6f17c00065e03fc61a89.hip | 138 - ...02e84359b2037a29efd1d6ce7213ba7605ab25.hip | 80 - ...1b6eda4f250da059fe0c428428219ff5a250ef.hip | 73 - ...2ab428503e8f8bfa78c8cb8d9afad9f5185118.hip | 138 - ...376ac8d82db1bc25fa273a80dfbf8b71ee5e2b.hip | 138 - ...5a5e40f6a66bc5292a56e0097c69fe37cedfb3.hip | 73 - ...87a1a9933239270f44b1e08e1cf5323521c089.hip | 138 - ...997f79435cf64add10506acb97d0647cfbb3d4.hip | 80 - ...b34d3cb673447773f6da23e9cf52b98e99f718.hip | 138 - ...c3425fe683d35dc3335db77d183ad1620b7a92.hip | 138 - ...c6c405cefe204824e8fad1b3dd34bba87e796a.hip | 138 - ...de1bc135191f3c2aff740f4c6bb7e98da42f84.hip | 138 - ...dec99707511cebd9188d216ee0a148d729b470.hip | 138 - ...38dc4f65d02776875627cbd20a9c794d70b043.hip | 138 - ...3e295b68e807774ed31bb914e4bc59312a77d7.hip | 138 - ...6aa150611b0d4800470c1493dc907082a5c23f.hip | 138 - ...81974c8b6f43f60d0af29c350d850b55c03121.hip | 73 - ...9937be2b9a13d6520fdcc922e4e75c9fa085ab.hip | 138 - ...9a22c6efd8bb8815887325aa0b739e260cc754.hip | 138 - ...9ab718fa23f24f09a713ac28a339208a7a5802.hip | 80 - ...b440ca9a5196ee1e72c878c87d96934e9273c8.hip | 138 - ...fcdea177734366d3bf283317a65cc3fffda611.hip | 138 - ...fef330a975002ed15670e8e7b26a10376d3cb7.hip | 138 - ...4f4cdce32189065362a502105c31bd2d9d99a4.hip | 138 - ...e2da8b791d31f4ba05ef5f833fd6dea9e35f1c.hip | 73 - ...568e11e44ce70924d27e683190422cfae5c31d.hip | 80 - ...af2bbfac25de2853be344b9f636226c1c0112d.hip | 138 - ...06d7803d06ef8aac1d5caac9f36aafd47653d5.hip | 138 - ...0dce1a17d073259250ec0c87ade69e639ffa8e.hip | 138 - ...dbfaffc8a9b573f194f9c63f1175d9725f8950.hip | 138 - ...f6461673882d636772ae4d26e78eabcb568f31.hip | 138 - ...19b8ed877d4244d01a17ecb948b459e361ff24.hip | 138 - ...21a4790f982d48bcaf950123c699647afb739b.hip | 138 - ...312d7159369d13f3148a6f0882dfad6921ceec.hip | 138 - ...530e20038eb40c49bc8b045be0cf4e7e6b4eac.hip | 138 - ...77735a36c325706bd19a12df66ed0839b032b1.hip | 138 - ...ad71883a19b522486706d3705700c012a6fc19.hip | 138 - ...ba0a3369d4e4eaea1c902a90e6501f232dd57c.hip | 138 - ...f1e7e478a2208c4d32e2d7e6abebdc16bcc5fe.hip | 138 - ...f28230817c9d9805c41dfcd4e834fe302e1df1.hip | 138 - ...fb8343e623e46f01893a2b61345d1ca5928671.hip | 138 - ...fe51f982abd60e567d4238d3266fb60e45814b.hip | 73 - ...00cfdc5592b7440d72482a18781e9cf3afb05a.hip | 80 - ...1992a2634cd6674076611be54197c715ad8271.hip | 138 - ...3975efd767ddf7c12e308d948bdcaf0968493a.hip | 138 - ...3d98ff43fbb80ceb82fc22ab039bee898969b0.hip | 80 - ...4c6ad28aff1976c6dd36974ec3b339aa3090e9.hip | 138 - ...5681d4e5871aacef74bdba9e368445875252d3.hip | 73 - ...920c3239bb5796b1ab2fc75177eb3b820aa784.hip | 138 - ...bb7b12cdd9b8b522af577e13232b2459dbd38d.hip | 138 - ...e6c7efbfc831e2bcfc8c1efa1a486c02627cbf.hip | 138 - ...ede7a18f3e3d5e24f6c70392413a2cda16ac15.hip | 138 - ...10303a0b79f2710eb7c66896d3c1f8b12c04dd.hip | 73 - ...1a0ce432c27f4cfa51731c3ef181bf60c8a727.hip | 138 - ...1b91c16e0255fe7a0a85638b98d94634e143a9.hip | 138 - ...1deea4f4fab0db31d46a91228601f0c272d6e6.hip | 138 - ...20538073888bdb3174a8e9c32d7449072aa753.hip | 138 - ...3d5273945c5d40cc05c2660af2df1fb7a15f3c.hip | 138 - ...4576e8ea5d59d7663f3760009a00a19e1b0667.hip | 138 - ...d571f4fe576fdb17d5f75a558cb6747087c7f2.hip | 138 - ...e5a98163e878c7697e554758ebd0597c2c1760.hip | 138 - 
...f3e4d4d4837a0cb33b78c4f2767b1d93da0850.hip | 138 - ...127a63d56099e08125b16939dac82f0173122b.hip | 138 - ...4ac5a18f57f2ebb65f7e356e858ab0d59b2133.hip | 73 - ...54b107e1b557ea36b5cbaf7fe3dfce05415c86.hip | 138 - ...ac6c0e61b65c9422c7f30fbd979031698370a9.hip | 138 - ...d0b777df1328bf24e070ed4cdf8615bb2199fe.hip | 138 - ...0453a5c3828c1358360f31f5d3b7258e17fdb9.hip | 138 - ...4efcdd12184211c74e7b3f2f30fecf1041ca32.hip | 138 - ...757a8bbeabd16a44d149ab188430f6d79ddcaf.hip | 138 - ...e0582e1aef74f9209de638b553ec0671476258.hip | 138 - ...4714e4f33340859c106a3129993e22652262e2.hip | 80 - ...5064e27ba427cb951f7e1b01328b0beb6b2b7c.hip | 138 - ...5ad502dd40353312d561e9f40aa478c16ef5b1.hip | 138 - ...5b5932f6df9a194ceb0d69220fba9596528eec.hip | 138 - ...5c161b725becf059fb4439c668edd454ac77d1.hip | 138 - ...909cb5f96a4884caa0d2eb8c5e6bc7fa352797.hip | 138 - ...b9544e2a0caae2c9e3dd8bbd2c509e8dca1379.hip | 80 - ...e81ab2e2678816c7b516d2d4c50e8cb5874c68.hip | 138 - ...5c6c0bfaf98f6e655fc443246b81fcc730fe97.hip | 138 - ...73e1fc0015094861ca0c1c81bacdbe0c5b8f37.hip | 138 - ...da56a4eb08b803332f25bda6209932d9624acc.hip | 138 - ...ec97bdfb6fa95e057eaf5a8138853e1c0884f2.hip | 80 - ...0f65bc99ca08eba66564d34f72f2769bff9491.hip | 80 - ...36096f49a89730f8af7e75457c88cb8ae64165.hip | 138 - ...49a1b8f4c1c6d37973ce38593efda1de8ce0cd.hip | 138 - ...4dc4ed02eb42c3fe303342801ed3073a0dcb8e.hip | 138 - ...6ba4c996570ddab77b6ff1e2a0101b638543eb.hip | 138 - ...863830fc5d43dc6d6400280e892bb7de2892d4.hip | 138 - ...90b771a4f9750132f549c82a88b4ab00dce5c7.hip | 138 - ...b09e8513646fbb2a007544a63ec9e2b04dc4c2.hip | 80 - ...daa59f5dce6fc3965193ae37d8c82a3d1834e6.hip | 80 - ...dd0165ee91c095a19ceddf08789e3576912590.hip | 138 - ...de618ff3ea9f67b90f2227fb7fcc74ea34183d.hip | 138 - ...f63cafbeb445408c884727b473667fb479675e.hip | 80 - ...37b7b6e04e1caf43a62bd6788a75361cfa98f6.hip | 138 - ...840494c4fa78ff399c0399b3ad7ca3d22d4587.hip | 138 - ...8727988e47264b42b4153dc82fc1a750f08db0.hip | 73 - ...c0dfd19a08d61586758091370acbdc6f267017.hip | 138 - ...c25cfc437d8bd803860e39a45b2f3b9fa48393.hip | 138 - ...d3eacc320104100bce46235fe656e5a8223c66.hip | 138 - ...0d45aa85c0daa299da98c277cee826fe67bd27.hip | 138 - ...57148f457557ea80ca56690e525db3a4b0ff55.hip | 73 - ...5ce4b3e9cc392ceafebc7fe3bcbe05aaad4bbc.hip | 138 - ...d08c5470a385d0160b2c1441fd1c30fff1c17c.hip | 80 - ...daccc4b3a0f90bff39cb4597f8b7e484613d9e.hip | 80 - ...dfdb42c1b380e860aa5609302f29698dd27923.hip | 80 - ...f4b869ff23874b6bde0aab68c419108b7e69f4.hip | 138 - ...2c64ef01aa228277d031a74df51363f98aa2b0.hip | 73 - ...4d6cdcd81a456125ab5e0875466c6334d8e5c8.hip | 80 - ...4fcb56caa8f80404789fba0ffac447483a4d84.hip | 138 - ...784fb4c0685d7b651f4113f3c71e050881f3a5.hip | 80 - ...a23ded424200d0c6f06b1dbd0a7b7b0e7b5d9b.hip | 138 - ...a2edf232786d458e2125f8dfeda8847f842afa.hip | 138 - ...af8763f289dace1054bdcb4dfeda28b0aefcae.hip | 73 - ...fce1e11aee2273620e75efe4aa0390fcde9ba5.hip | 138 - ...0569ae9dbd693c0ab3d6ba69704d31e451011b.hip | 138 - ...1b6a64dd181f2efa65aaed03a3d229b3566c1d.hip | 138 - ...1cd6b60a97e7071518cbd1a63abb8b910df024.hip | 138 - ...3715cce8935439f90172d141050d78c7e76fb7.hip | 80 - ...605b2ad3e3753c5f255678abc1690b949c5abc.hip | 138 - ...645b713821371161a9925dec8a3d6c157ba1aa.hip | 80 - ...aff499ad527be5fe33b8e92547df57af26d40d.hip | 138 - ...b99af9a573df50a27fccbec3fa8e350f1854eb.hip | 138 - ...c9f975891087e6eed6393629b41155deafc509.hip | 138 - ...0ac8e8a03f8e7ec2c6e993dd39f09f465dab57.hip | 138 - ...4ac01458df3f240e0656d82330f9de23ba9651.hip | 138 - 
...4b3731883a5f8393d60d27487f8d017aedd3f9.hip | 138 - ...e82799f4452e148c3e02acd6526cf30757eb52.hip | 80 - ...edfe3e3dc3008b928c8e6dbd50784b905f189e.hip | 138 - ...00779c17b7b21c18e1308e6d765fe02a7945d3.hip | 80 - ...149eea92f2c40c11de3b778102fcf9b6a006b8.hip | 138 - ...23b36cc3f56d1001b2d3abadd8a5628fefd014.hip | 138 - ...3c8c746055851217a514321cd735eaf6937263.hip | 80 - ...4b8b52f4a98801e185e2f132b2f80c29dd0c37.hip | 80 - ...6b79c4ebdcfd239cecec58203606bc123bd6bb.hip | 80 - ...6c30148a6fa816937f2f095802264d3dfa0273.hip | 80 - ...03eea8075cacec4d41fee7dc4734f593ee79e8.hip | 80 - ...12f23ef88ae5d7b161d36f42d22a5ba53b6354.hip | 80 - ...13fe25dc90b3511fc259cebf463376dcb55d84.hip | 138 - ...145383e39dec0e346b5094401acf85ef3c2075.hip | 138 - ...23b191785c97d284675f700a7baeb52a2eb791.hip | 138 - ...290cc4c3036c9205e689cbcc60e7d16b97a7d6.hip | 80 - ...33f4c03e338ea7c6d8f759c1132499bdcea059.hip | 138 - ...73df9ccfc1ace90fe3afb5c00976deabedf6f8.hip | 138 - ...adde8780b39f1364c572a19c3bfb19417678e3.hip | 138 - ...bda8157fb27d544e049fd7d2ec735725f1bf44.hip | 80 - ...fae2c18645d36a181a0bdd2d8ca7a4ac0f6d1d.hip | 80 - ...2773721479613ad72e334510a248f1436b38d6.hip | 138 - ...67098db97b3f26e71a151c63b74260bfab21f8.hip | 80 - ...6e4dcbe9c4cac8f7c8c5d97ce384ae0cbdbfbc.hip | 80 - ...901a63986cc28ef24cab012b32114851a8c1ec.hip | 80 - ...061c204d8a85c974676f4438994a0be9d69a60.hip | 138 - ...24ee32b178b6bffa7a71603d6e2818f66177a5.hip | 138 - ...37609afa8e21a761dad6b01ff3f26346e450fc.hip | 138 - ...5835bc6f000d3a3379bbc38d90e83dcaf867ee.hip | 138 - ...92eab7de49033f5480c5e86a69e675db0d2a19.hip | 80 - ...c23b7f8fcc4e4f4c81f5f00cfd345b98df2e0f.hip | 138 - ...c3e27b522320dcca5ee84fa534b03aae2bfea9.hip | 138 - ...07d8b5666423da30a95e3b2cabd3839d200981.hip | 138 - ...29a515d14dac02066bcd4701285b9916b43cf5.hip | 138 - ...6afccdee4107507a64323e17bf12c46da2b92a.hip | 138 - ...74887afedbd67928fe4d596709f9ff92530611.hip | 138 - ...822ea727fb3543e445e4000f7e6ebb946d6a3b.hip | 80 - ...9f6e1d59132fe96709490af25bd794f267851c.hip | 138 - ...0d0cf55d90b3f3c9eecada1db93c420f34b1ae.hip | 138 - ...5016bff9e5dc37184d2b9417eb351c7ea1c322.hip | 80 - ...85839ee8d464c5a81b8dad9839f5e0f4b467a8.hip | 80 - ...8f0bd93b352d28c5b6d78f4332026993f0bea4.hip | 138 - ...ae1670fac6812b2d2cbad973e4b475509ea504.hip | 138 - ...b06b43d5d65429e23cc717448cf1fffb0cfd74.hip | 138 - ...c4135fce01e8731fec7a78d0cc0fdeeae28b90.hip | 73 - ...cea8f7b5930abf76eecefce92d0db785d2df5d.hip | 138 - ...de2ef18e2174ebe13a6e7c8c2a6b05a6612047.hip | 138 - ...039d422a57c159ea4dbcc867d766ff1b356a07.hip | 138 - ...08afbff5def8bcb4e823657ce01f57c9dc77c9.hip | 138 - ...184767d723f4995791848cdc68bd948408204f.hip | 138 - ...1a7f9b1afeba6690fdc0d0d1755ea89c805573.hip | 138 - ...34b6ef496d4e0d8fbbe10731d4a7b1c136c036.hip | 138 - ...3d625c5ad3e871f5a727ac946df642d988b9ab.hip | 138 - ...4d27535b9570b8f4b790470a83c1d0a9a2b6ce.hip | 138 - ...5ba6d73f331c76e696953606c5b347b6a46f3f.hip | 80 - ...62a8db637d32e7dfdb2521cbdae6e1fbbd5fd1.hip | 80 - ...818f3ce244743cb1dbff9aca399df90742a6d0.hip | 73 - ...91797c1474a368e9cb056b50b4629d7736c3cb.hip | 80 - ...9e54273c0ea2358fb573a7d918aa7b09fe07f9.hip | 138 - ...f815ef540060cc7ed43e1c57a28e1d080c5621.hip | 138 - ...10bbf37503bbc92af82bc3487989b41b20ca85.hip | 138 - ...11806cd2d3ef1127f676b2d98bf8fff2a1e5ab.hip | 138 - ...35634440edb25cb095800b882c70aaceca1dbb.hip | 138 - ...67d442001d2b167e70e8730abde4d4461b8569.hip | 138 - ...9494d9ac35eba6794a4f9120d2db9932596ef8.hip | 80 - ...a8d021381083bc48b7fb1840729254dd8e5137.hip | 138 - 
...cb1cfea1b0dbe50a02252cba99428fd977527e.hip | 138 - ...e93ffe7fca311e136e42fbcd12b05c9fc7174c.hip | 73 - ...f5339054f47d9ed6cc7f9e66ab21ce3bccf3db.hip | 138 - ...1ff66d2aeb47d2fdccaa4bb6b9d066b380c99e.hip | 138 - ...26a187c4db06115072a5132e1166b5b03368b0.hip | 138 - ...36bc309877917a18fd21acb30563c7e2f233c1.hip | 138 - ...5359f0fba3da9dfed06ddbea8fe2a33a9cf40c.hip | 138 - ...6683d175affaa5ff261ab8503f64172d8eba8b.hip | 138 - ...7eb562a7eff31d589e12945d80233aac202ae2.hip | 138 - ...85901d66dc04b1143bb6404445baf65693b781.hip | 138 - ...b9ec2cccab94920e40f62a1f0f094acd919d07.hip | 138 - ...0b2bcba57e77d975ec5304fc50cbd09cddf4bb.hip | 138 - ...4bb75ca79f805a81fbad750ad22f6d22b0d8ff.hip | 138 - ...4c9eb48da49a61957537270d94e56cb4e426be.hip | 138 - ...5b1c6758d4b8540158299dd0362297083084c2.hip | 80 - ...645b3888dc8d1df50c47c0d75822eebd3eb019.hip | 138 - ...66feebc9a0dcc508ce002c255154622875e524.hip | 80 - ...cd68acfca68d1acac94f493e25be0ef20f209f.hip | 138 - ...2a198f23c409b715761b702d7b0e6e5992701f.hip | 138 - ...35773419a9b3631698a3d375d829af55f7731e.hip | 80 - ...88f0f7363804cf5403adef70828ab32d09a02a.hip | 138 - ...966fa1ff013e477b1706928de6cb7f8587c154.hip | 138 - ...9d9baa269dfbb30b714389d1733be51cc419b7.hip | 73 - ...e48d7edfe9513f24ad9fae68cac3aa940b17dd.hip | 80 - ...0f47a44400de385ddbeb99475b717c5646fb41.hip | 138 - ...1a3b7d4fdfed64e64f7a95dbc64eff541092d6.hip | 73 - ...3b86fe4e153e0bfa8d1e75f3641fe32b0c5149.hip | 138 - ...6075c3a5fcfe63ba12e854bb1fed6873f014ab.hip | 65 - ...6edb824cecf459a8ec51b8dc74b1e06369aceb.hip | 138 - ...c1a31a1d8556cbe0b6ea76faacc78855108539.hip | 138 - ...cc934ba7baab1a2eb062df1e4ee5066e9ffbc3.hip | 138 - ...d85ad2c9d197f501267fe0804e6985802fbd18.hip | 138 - ...762543d3380185e304f84749a70db1b8d3dd8c.hip | 80 - ...8fd64c2f2b27577109a984e6ab82f5f0fcb296.hip | 138 - ...b629c37cf94134693ce455b8c88b72a39df7fe.hip | 138 - ...bf6805a489739abb77c13173d57723e9304afa.hip | 138 - ...c9f955f227430c6224ebc347649386be7f01eb.hip | 138 - ...deafd2f36cee29109fb824e0135407453adcfe.hip | 138 - ...015c5d50481547aa5754d042d9d7040cf1c7ff.hip | 138 - ...07a1b0d5a8f94e0a0f4032f401d20b4b643523.hip | 138 - ...34e691714f0b99773c2ac515ed82de0f387065.hip | 138 - ...4b7e452a4db74189334697e3a240ad68085f0e.hip | 138 - ...89d0e4442cd8304081892ddc75043e68a6398c.hip | 80 - ...65193d97d43237c22c04478ca5833011d8dc8b.hip | 138 - ...77abef05ff37ec27705eda51896e2aa3a04966.hip | 138 - ...d9a2396ceccdadab24602f30e9070901a76dc7.hip | 138 - ...02730dea6987e2c038446c448aa08bdcc23113.hip | 80 - ...14c6b4bc75d95a150104a17972abae77cb47ed.hip | 80 - ...2e3053f30f780f346fa6b7a836ad2554cb85df.hip | 80 - ...6757fb17f5e94a6ba1fb14540a68c36d571159.hip | 138 - ...78ec9e09d3b78dca6b5bf0be1538657f02f319.hip | 138 - ...935fbda313d3518f142f43d46f56c600f69286.hip | 138 - ...b2bb9f8466de1ad5210e4c39ee7b8ecacdffa9.hip | 138 - ...b65fc519ea7cfcd19f7eddbc3acad6842ff558.hip | 80 - ...c5079636a4a31a849ce8a5af89d50330a74628.hip | 73 - ...ccd5f7ddc894b2717112cbfc766804e02b7bd1.hip | 80 - ...18fb4e529104fc90069c8779ce5463460bd516.hip | 138 - ...38053e01268a4c5883620fc6a9901951e2e01a.hip | 138 - ...39a1e84faa98477b05df71d363b9ff0f9b2760.hip | 138 - ...8a9e05debd456a9975953f7b0d510e7a0f6978.hip | 138 - ...973d75297bd2c3432a7c88e8a9ee1c9ae693bf.hip | 80 - ...b53fb8d81148ff384d31a703bb4c2e7a5a33af.hip | 73 - ...e0ec1db1ea308e226f675e68e29b839e41b252.hip | 138 - ...e6b10e73733716e71ebf5a53703fb935fc5e02.hip | 73 - ...153f9a9b0b7c54ddf2debbe297efcffbb4fcfa.hip | 65 - ...3a776ae4ba68c23acab1a5a6381684051738ab.hip | 138 - 
...5c757c67aa23cb88e1aced6fcf36b7b28391db.hip | 73 - ...5d492ac3a6ab75648056bcf26250a4aa929cfd.hip | 138 - ...6879f8ff4796f48ad87ff8003f4f6e6adca9a0.hip | 138 - ...ae1294b6dea5c8b93c2b814fa7460c4047105b.hip | 138 - ...b2eb64b66d46359fab44333c2c484f4c9dd5de.hip | 138 - ...c0a99e949baa5f3a7ee2d6e84427982f82f76d.hip | 138 - ...d37e7ee96c392fa24c02a9143438a3a7d05741.hip | 80 - ...de729aa50c10d8101ef504138c3769e3286753.hip | 80 - ...3c604d1b8260958becd1c7c209745ff9151715.hip | 80 - ...9bcea4393593313d18a4aa6dcb44cd75bc828d.hip | 138 - ...a9427f34bbf5ddb28a39161acc36806e68f2d0.hip | 73 - ...d8fe5f4f8641998b8b805a20b2ca92d019ee59.hip | 138 - ...d9b65558398c0c10127b560807578ef117d7ed.hip | 138 - ...07e8d1089557dfcc95a05160be5092e9119a53.hip | 138 - ...5e3908479965856843317c8b0c42a6961dfd23.hip | 80 - ...86d5f8d5591f3e0f1cdfad19c38c420fd93023.hip | 138 - ...b04e6d5527ba0b8089ba8bdd264e2d5759338b.hip | 138 - ...b53fa68641f45baabf40b7cfb8b35a9a1b9c7f.hip | 138 - ...077e68dbc1bed2dd20a5f4dd35e0cad6330ee4.hip | 138 - ...591185b1c5f521023e250a26f742984255b241.hip | 80 - ...62567e9ea16771d8445464c38f5a2931cb355a.hip | 73 - ...6a6d4cc262ea838dbb83ee747112f95fa297bc.hip | 138 - ...b6cdc59bf216f7045f0cf5f221bb91ec415cd2.hip | 80 - ...c353f963c52624cf79e82cc2b2c02eed94b677.hip | 138 - ...c5952f46f4f2bf06257b00661774eeed48a323.hip | 65 - ...278488b2cca114adca5e4614d86f92447f937a.hip | 80 - ...b241b947a0adfc8e50c5d71765c14af24593ae.hip | 80 - ...b9abf5b09e63cbe76390bb46ff7cbefb3141f0.hip | 80 - ...171210efd217c07d357fcf42e5372ad7e9abab.hip | 138 - ...3deb1382003ac010d9bc1c59d1878d3ec7a727.hip | 138 - ...51d24ab5f24e003ed6751ae8ae5b327892b15a.hip | 80 - ...7ec8d547ee9713aa3b5b667f22cdcaa8f62b2d.hip | 138 - ...7fc24902b1ebd8f2bf8088b0ecf6de8be8362d.hip | 80 - ...9f63a538940e5ace02ae5b5ddc01f730adac4d.hip | 138 - ...a613eaa8471ad7da66d2f8f2b8e07f6e02b467.hip | 138 - ...d7dec90b3c62bf3a30bd75d3c6869529a06b01.hip | 73 - ...e60111633db08f765b3c7cd5cd768cbd030255.hip | 138 - ...37ba962e0288e2840eb0925d016b5a7e3b3164.hip | 138 - ...6bdf67720e938d538a867548ac3579b8238169.hip | 80 - ...e81dbc4cb208ef6e684c76ba1eb451d37fe10c.hip | 138 - ...1a43f2210a8d1e5623411c95c33424cee5e747.hip | 138 - ...239db5a67c23a383590a651f0d8a0be43a13c7.hip | 138 - ...8e709eec7aef1fa681053c6d2969a5ff18c45c.hip | 138 - ...974931e65d6b16b7c868d462b95dcae20b7513.hip | 138 - ...b0e96b759e18cf703cfab0cda1385726f6e0a1.hip | 138 - ...e408cf9456ff977aa7d12345e9b2f1e60639f1.hip | 138 - ...2ebb4a86e7ed0001de9c5e607b66fe8877409f.hip | 138 - ...40f0acf1885096efb840ec5600ec421c4db331.hip | 80 - ...5421703cbfa63a58ec02701e245d479a1fbfc1.hip | 138 - ...7cc2aa1ffd38298b52764a93cd1271b4d92f8d.hip | 138 - ...aa0cb33c71cb8ca7b83dd0e7a6c7b01f6b50a9.hip | 65 - ...b9e7d9af47cdf79f15f674f8976c05f08b0ce8.hip | 138 - ...c6a7b25710f0626c3af534111b161e1459d2e1.hip | 80 - ...1468c62c878295443981662e037ec5213cf7a3.hip | 138 - ...20134822739be6fa0bb3d98e9dec79f025324a.hip | 138 - ...209426a8e6bfeef7d8ae7b16db791888142298.hip | 138 - ...28af9e5e3c25800dde938e991aaab4fc1d64aa.hip | 80 - ...53c9c32518b895daaa3521827f37af78836fb8.hip | 80 - ...69b38b26c30bc770f74c856e47eb498f5818e7.hip | 138 - ...cad48d9bc80d58705ea60eb2dda4baad68cedb.hip | 138 - ...246d1013d954a9316f4432c986d3be9459c548.hip | 80 - ...2f1f1b679cabab04218037ef370d2c7e1fe332.hip | 138 - ...5c41ddb04ec7f80235bb3db19198dd6b699713.hip | 80 - ...8c74becc24a93427d9c0838784e9b6caad6e81.hip | 138 - ...ecc90ad7b86791a9e6f73a582aeff30f393804.hip | 138 - ...1596e8c608a795ff971aea8e199db9e72b65d7.hip | 138 - 
...4bd5b92ce6bba640b8ec6b4e53fe35902c5572.hip | 80 - ...4d42e820adc1a26a428d59df7ffdd7f8580176.hip | 80 - ...4f26e45d5cf567d29fbe375fbf8abdec39186f.hip | 138 - ...5b87c435bc5d7d85d738f3fdf68947d79f5a77.hip | 138 - ...80e1639680ac1e5830a21f921bfe2cf364ef42.hip | 80 - ...da112b1e07c44fc8a7f19368da203f6935049c.hip | 138 - ...0316cfe49323638f71ba688dd8ff9b2266b335.hip | 80 - ...193ea266f3718398bc5622f8bc7042c3527a42.hip | 80 - ...4fdb8294257d951dcc9c4fa7ecf1192568b91b.hip | 80 - ...6aaa63ed42a578b953ebd614318d44cf44e8a3.hip | 138 - ...95bec57c3b2e6e169134dd8d20b287d7405134.hip | 138 - ...bf7ef503bb026258b3ec3d82d3ef1443046964.hip | 138 - ...d0166931e4406873d8f552a5d5b61fde2391a3.hip | 138 - ...fd08d56f8a9be1a8dd104cdb1ac58e283b5064.hip | 138 - ...ff73f82aee3184849d04c2364eaa45c6d0de9c.hip | 138 - ...2cf0e5fe479690883507028748b0cd3dc83cbb.hip | 138 - ...658c32d562f9d60c5ca1262a2e0df2375063bb.hip | 73 - ...8f8b681a405bfeba5aadaef40f32367ec5cd2b.hip | 73 - ...900c0a5c0d03dc17d7a907ab40652d9920e756.hip | 138 - ...a6438394dd3427f29aa0bbe58ad1f797c3c38d.hip | 138 - ...b87f983a5e84582efa1663f84da76cf60b5f6f.hip | 138 - ...c803838f5644ccc6f04f7c8a6233fed0b6639e.hip | 138 - ...df1cbfbaf67705820f125b474469ad7ebab0c0.hip | 138 - ...0fa4ea674a590d0a817367ad9915a5fce20c51.hip | 138 - ...1f1a11f778d99a00aa5959a3e58a41fcbfb1e3.hip | 138 - ...25b59df454ccf53da6cb201e0aa8d09f52a2ad.hip | 80 - ...7f84892e2a8496169b7406e63b0d4f5aa63aaf.hip | 138 - ...803aadd93e33567aa6b23100ce4fbb6c040dd6.hip | 138 - ...f1797f6b672a55476348571ce17645c8a62869.hip | 138 - ...566441ac3074578cfe45758ba0583c0da0a5ab.hip | 138 - ...72bf80a78885428b2c02e522426470653a7351.hip | 138 - ...82399cd6412fed6a1141296a7e4d42078f7b29.hip | 138 - ...856ca950bcf173571766c3f04de4163be0402e.hip | 138 - ...9548d6cced86c21c09c6475237a0cb926df0ed.hip | 138 - ...9878f4ca8cfe6b8d8748766f66a1ef8eab20ad.hip | 138 - ...f102a388ffb05c690a20a29cfe0b35a35eed61.hip | 138 - ...035f4bfd8f2f427720a07e3c311bccc1dba683.hip | 138 - ...1f96ce4dcc7f789a8ace73c230c203b05ff6dc.hip | 138 - ...27911254904ce4341e4ff5f8bafc430b8cfbbf.hip | 80 - ...31289837f915e2aec1bd01eef1b3c1b099864d.hip | 73 - ...9def2b4edf6d18f6ef1d6b141f9e0435441f6a.hip | 80 - ...aa9c39b06e55bf4bc9f9a2a0fb075c9d4e69ce.hip | 138 - ...cf08242b3fb1c643d4149bec985b667b9d28fa.hip | 138 - ...51da732f397624717160f89271514bc334b59b.hip | 80 - ...61d8693f82d22e2c5b1abbcbae5f30f4433e5e.hip | 138 - ...7790f260630f312b84888dcbdf849ce130ae59.hip | 138 - ...7991cb7787a29d3ce4711b4ce04c5fb6a14ca9.hip | 80 - ...0410c26d7649e21e2ae5e32e7af89d84d2ea70.hip | 138 - ...2e9a82c879051d6fe3c42108f8a574187704af.hip | 138 - ...3bc23b8a4f1e0fc5c5756c4e1c835bf59dea09.hip | 138 - ...3bf815b520a9d9e17b43bf9d7fb870751b6225.hip | 138 - ...74b12e83e214c30995a25631d37df1478927af.hip | 65 - ...824fb32933b27501ae8a7f43f460a2dda6a814.hip | 138 - ...8a6b193fec3203eaa75819f6b51aa45a48f212.hip | 80 - ...c58761c927b222112cb5cb6c9acb5d3c915785.hip | 138 - ...16fa84278b489af253b52839786f94aeeac36f.hip | 138 - ...62a97675719c2e8e9bb97361b92ff1c7b9d2ef.hip | 138 - ...85f869a92f0482605e52019828244b12e12b44.hip | 138 - ...bdc143c29d5ca50ab1e96a814bda6d05b0d5d2.hip | 138 - ...c5a0f98b94530befd634891e42c424bb86f0e1.hip | 138 - ...c99c3c82b77946f6844699d2333cd532a78a26.hip | 80 - ...f56e45b2240515e97fc1bfd552eb03b6de5094.hip | 138 - ...f686067fa433cea5e95dd523846dc881eff635.hip | 138 - ...2fbb135d59028afcf867c2cf08edc323565528.hip | 138 - ...4c15452f9155c5966990f09432e5eb7e28e785.hip | 138 - ...4c5f8fecfbbe16e6648becb3b5ca89fa3d8a94.hip | 138 - 
...5bb49928ce5515d7b297d5eadd4ec70a22d60b.hip | 138 - ...79e1f9231692d736dbada062ed6821f34927bf.hip | 138 - ...9477a613665cebcad781389ba7c5a36f51efe2.hip | 138 - ...a36678d5047ded97ee7a7ba9feb9569afdb6ea.hip | 138 - ...a47fa8d9b5375bc408af68b67345ab9dba2eb8.hip | 80 - ...ea85b766bf0c918ee0baf24dffc6a5563d5105.hip | 138 - ...eec221cd63adaedceec39db41ea942f99f5133.hip | 138 - ...030b61ae20c4b7d9b2d10930a17e01e9e93328.hip | 138 - ...1790325b59bd44b0a5f6cf9723a25fd845cba7.hip | 138 - ...1eb85a00017efdc610e4259d2abe935b85304f.hip | 138 - ...5841a729099340d608e31023acbeaeade3e886.hip | 138 - ...5ebf0f2200f37ccc0849e0c3745f6e2f00111d.hip | 138 - ...7b0916744b593435d8e1e7b6d874d760cd5e3b.hip | 138 - ...86c13e933cba40553ffba31d53aad27415ce4b.hip | 138 - ...b0b08e29b2e1bf181fceceb9dc416e54f52b00.hip | 138 - ...b6ef39c3db49f26f736d6c9221dd825409ec4e.hip | 80 - ...be827108d252b2f5847fa8e132c9c3e56a90a0.hip | 138 - ...cabea88b8e290688c1b360875d228e6fdf1624.hip | 138 - ...10a3b937e9659716925e39a01d794914b08e26.hip | 80 - ...19d7614f2ed5da21a52ed172ef62cc07c9c01a.hip | 73 - ...26e43ca652e6f58ff48c356165aa4349833b55.hip | 138 - ...345632e0cae0d549ba79626a08b1885711deb6.hip | 80 - ...3558b4c7a667dbc365c4c2ceda646975408f51.hip | 138 - ...614df484b263deae3b3c20adb0ce7b62eaa651.hip | 138 - ...9cd1305633b62b68fb8474ce021f639f8492e7.hip | 138 - ...e12cd366d6850ce26afce98e5076b695b4875b.hip | 138 - ...245e9ea974adce2b9807d33b9ba12d916eaffb.hip | 80 - ...72cdd69944d2d765478d4aed13066a02b76f6d.hip | 138 - ...8b8c3525fe86a20a2d6c69585f3e36c16caabd.hip | 138 - ...97b7adcd67ed9bda8831d1f3f1ca7590c6d251.hip | 138 - ...9d98dbec5096a89b116f85675af772f023014a.hip | 138 - ...b5e77111fe1e20bafdb83a925b5faeeb6214af.hip | 138 - ...cd7501265b4c4dcf015485e63e2324304f70d3.hip | 138 - ...cffa403b3631b1957e1a9a06f18fdb3b4eee5f.hip | 65 - ...453e3bdc9752cb7b81f7cc3056325a8b9a8ad4.hip | 138 - ...6862dbdbb20bc63a650e1f93e9ac169bb702b2.hip | 138 - ...b5b7349a671b182d73c8016590f26fe06a4cba.hip | 138 - ...b8adef0cef91a86f36872407fea35df90e8f2b.hip | 138 - ...c6056d9fe125a4dbe08c1d86354e51f7daadd5.hip | 73 - ...d868d49abdb769ab82c21508d655daf54b8a99.hip | 138 - ...f7aa57cca501f221077124359a589b3a6f9d0a.hip | 138 - ...fbfcac254e33926131a71905e93f9cc0aef89e.hip | 138 - 1810 files changed, 233848 deletions(-) delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_00042c36bc588e60a7c8a9ba297a8a25d8ac0660.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0029076f83a3dc695a167beda6fe19230a2b114b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_006c417a52a1bd7c55e45d111483d26f4480caeb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_008f2429c678d13386a06e8d8b15c4b480940ff3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_00a2adbe938d458d51ca5fc4020667a215b672a4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_012c0f480917c329f4c3c6c666cf32af2d82b294.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_014c209d5cfc6b965bfd78c64bf132c0154e32be.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0153ec18d3ded0f8bdc6459ea5757ebd94d9faf2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01ac1a2ecf9a487809e46faa92e267df2d47de91.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01ca79005067e20e4eed5a72ff9187cde702cd1c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01cb354dddef6e99e4ac843f2adafcddfc58d520.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01d12033d59ce2799a2a024e5d9232325ccf1320.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01d3b034a2d8d0b83c0aefa4faac6c3f28ce737f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01e2428c5447aa9a78f79f73f31cf685c586872d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01e8aedb7b7d77f44a46b2e9b7a826f245aaf4a7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01e8f0df0c54ce619e5b66441b3c96a5e18b05d6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01ee0083f6df962c4a754cd3295b1a436c590a0e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01f74764c3c3284fdd1b67d0ea781c2261ed0de6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0225857454eaab2eb664aef7a0849ce12c32fdf9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0237c76137df14fb808ade8bd6837045f2aaa5c9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0271bd8b7c270e1593871b638288a4923342c446.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_02d88a03cd3966dd0cff550065f58c3ffecfff6c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_02ff94e3c787a7b06ffc90c25777fa74f225e32c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_030a759dcc92028b4c6f317fc230b98cb929e806.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_031b12f9fd94e01aaff2c0da4f35f346822087e4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_036887daf6cc092e7422a17882488e59cecfb643.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_037c6c80fcec3eb8b0bef50ad6af6d27bf5447f5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0392491c5a6dfc742c2be483419a40f6a7a7ea56.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_03a71615a088e972c998f9c7cb44566c268c5124.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_03ff035717140f7385282419598cb4fb2881ce8e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_041a0718891596ddac1fb0088637029233ccbe60.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_042a156e9eb935555ab14a84461959b466c2fb5b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04641230fe9a50a221047f7a1df8a370f72805b9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04c363e11d202c6d2f4bb753661c5a2043edc0ad.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04caeecbc01667ec6f5599358a0a20423aa9a00b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04f39b453505f68a5091f68b1c3de48369d1e7ea.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04ffca078cfab8bc6c4ccd1cc8994a1bb4a88ea7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0502e718337eab7d47aa65cea7d3c5f641484520.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0513b2f3bd8ad51315aadb7f63737201898adca8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_053981d9e7af2ebc0f91e61ac5e25cbe68c95bd8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_054fda16133a0d25077967b05425f9128e1fe1a5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_05538339c21c92c53d237865d72debaaf2ee5075.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0595316f0dfffda03e5296b959a49ec3f3c48d67.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_05dfe927fd64a564c5fad537fb7c41ee9c94c2c0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_05e60b3ab7477f9edc8576a8bf43e3a62b8d5ef8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_05f794c7023cbb7e35f1fd1ae45bd2377bfbc520.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0628931bf5cc1daa6e106cf60bb21fa1aac6b1df.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_062c8c3c1cf6c33af4574099e9b6ac54a55ad776.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0682150e93f547e00f13cd8984779bf49b91e50c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_069c663be0267c009be4814e9e4e7c13ec999411.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_06ae52ef937cc27c544e32025ea0dadb7fad982d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_06b74acd9abfbd1c4ec2f4c718eeb92a0bca7bab.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_06ba94794a14f0f0022af6f5f3c16e1e16959d4c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_071751b1012b90f7b57f8591cd06ae1fd27d9cd3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0766e7aa4b263a811408b285213e47176ee2bdaf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_076b3beb57b30afb30636f948e3989b346b38d20.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0789852b0cd3cc030c78b28f2fd5b6b0546382a4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_078b96ad691a85eebd18586db0b62b8911016d9c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_07c3fc96d2bebe546dce6ebf46e5c7a519959599.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_07ff04fcc273e469737512893ea3fb5876ac131d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0801c56831b4c6428200db6318638a2129bb197a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0836d5dfc0f939ab9a4064b403339373caf35b56.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0842c4e3aabdf55405b3ce09ce1899245ddf11ad.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_085722b43cde5f37242edb071f639da7c4a0bd48.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0878b9aa31429d23a93cd953cc6a2fc5f43d0d3a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_089a347aef8a920e3b59d5ffe71fc5bfe002609c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_089de13222caec1483207d4a54249f8da4f9c151.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_091cb49c1958fb4342d79f367ea93cf2b472f785.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_093834d4d3fe76e1745e4482c6b51b550c6f3dfc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_09513bff5c1da6aadf11d2e8272a422eabff21bc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_096863cd93d1b105a617d0daa1d4f37d7fb6b893.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0968cebd81ade762c2f92fffc0153fa7a2b91eb5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_096e888c52d0f4a5847d7515fcc66208b1ff40d3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_097b3e1dae9bfb2e89398706508f8e01966fd4ea.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_09d76cca48b71dbcc9bd96734787209fee4c9a74.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_09e50367b62bb09071e28b44235a7c112645a706.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_09ecb6347009f6a5d5530a6acf90f9f40288cbcf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a2b116fd5065109aae46ee547e4f49ad0e9d6e1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a4e76d89b175e1d9fd2e9fb908d5fce1ebb945d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a55ed15ef58c941e06dda890aeb530e28eb7bba.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a672fca51de618e3441cf8764e8e83eb782f2c7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a68c2f9a3acdd787b81be455cbc7836c8bfd90c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a89417a043556970f72eebd48b4f3e7ac15377a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a92671b6ea99891c0d69b1c793f4d131b9a82ed.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0aafb881e34a3794970a1282af740b3f19c138b1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ace6e29e1d3060c3086c08fe27b471e375f9c75.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ad9d68fcee021437e13ffdf94d78252205f5a31.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b2647b5982405a48e8c8888552a4b89386ccdd9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b2efefea81036641561bed80c75d77651176f74.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b3153af7bcdba33115a0d31f121fd76be2ffbcc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b532fcf26f90c82a792cde7943634f667c1d033.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b90a0186d8b8004e3f19886c7992c8e04d0e066.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b9585ba1c10acf67115c5899b3546608541820d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0bb81407c8a2b3cdc5fecf655b3ad64d5d729cc9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0bc7910aac798f0555e9e505ad7f177c9fbbd92c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0be8cf70c6be969ecfca675782c860b5b75ac089.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0befed50a89d80c22b2c8c3d5ba67d73c3d0190e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c32a2d9701e23dd930119c4ee8089042b5b0ac5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c3b2ec99fa7b09c7f78dcc3142a661d686044ac.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c8a0bb89a6f05289c0405df5126fa0cc16252e7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c93c65e5942a2f43f2e491547add02777dd2eee.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c9bd38b8f9009d932ec49204fdea39a52885246.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0caeedaa7d50f1741d618fb6c573529eebb075b1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0cdef49859c80c6b3ba18eb2fb4c35c72abc1cf2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0cee6b9427c164d78994150305a47f73954a67c0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0d0e0147a92061d32608a34e7b47bd534eb787fa.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0d13a4c8d169877da6408584dc1f20a6f7c5e3aa.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0dde401aa76cb5425563cbbdb0362748148da3ca.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e007c36231ccdae12f102eacca1f74b0711b9c6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e0a2370f2a320484d8f9f21e3197425c2dbe9ad.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e1dbc9c433ce8ec33ace9e62550261d613db582.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e3f4cd28a4c06cc109f6a0798a77844bcc750b7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e661b5f30566d1f159f060c264849c7ae4772f1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ebacd06455ab20eba78b389462946716b5819f6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ef309b923172f4c0fb38d9b9f5325b33b4877c2.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ef9b9413697d6f4573c6605bff6f58d027c5016.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0efdaa9266a5a464009297dc59db92504f8bf1a3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0f0c699d9c3b0ed62097e38ba05e40e815cf474e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0f588dcb2ef86677ebf84e406eb802e9921d1f1e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0fbb0bef3b388867e75d7a8a187b8b4b650a42ae.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0fbddf533661642d84bf5a16149692d5a892182a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0fcb7492feb79e27e0bda73e57ef7dab410e2bb6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0fd4068ea93fcf4df463e3bf3a6898d23b65da7f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_103186dbad604763008e0204a1ea90baecef8877.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1037f1bc50c4a65dac09ba56b701256b701c4322.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_10a055e5c3d6a953d470db5dc21449766248058a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_10c24f1f9009e46afa3a59193784cc2575f79056.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_10ceed95b0a0a01f844678717c88e0426fb503fd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1132b11429034d96d82c82dbfdb69e460ad8a564.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_11e7df31541c3aa919e9825ad7dc4432f9a03c0c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_11ff174ff2175e9ec22ac3a0fa59dd7713b79643.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1211733062ed30b876f1d63bffa642d77e258dd6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_12207f4b6e7fac27d6c16493a5373f448a2aaae8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1241814f76107d74ed069ecec99a248676487eee.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_12d5c8a4988efe60ef7943ecd73e18a28a736583.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_12d60c8abecb3bc9b84b0ea7851628ab17d8b0b3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_131691f01cc7f29affb88152dd48c7a484315dcd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_131c1fdc4206bb952b2fea675f24e3b09f605eef.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_133c51948cf8584900807998da14d788039f53b9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_135ea67de101135ed5fe04f5cab1ec1d7b3714bb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_137fa6780d9e6bde10aec10a875c039fdbbc652e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1386cd75411e61a8dbbaf2b916e62f4f5f99104f.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_13d5f2ec83b3331654e37ea0b44d88cd98abaa37.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_13f747525ad31e76c88774fb2208e470da9c2310.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14221590b90c48d3cf259fb4e834ccfaf7f3209b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_144f19363ef26efd36f0436cfa9f84f181a8824c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_146eb8c40e3146e06936f3141b2c4d92a578ddec.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14baaaf1e90a075ab802c6e7d97c4b1605c8bd72.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14c4ebd1792c781d219bd21b691b575f64635730.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14d11aad7b666f500f68b264a2fcca6dfc5f1a05.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14d4630876785655bd4950566e81ae0b645c0d3c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14f77aeeafe4b28f314fde5ebccfd2a554872781.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14fea611f3c253aebf726af3e5fdb7e63e18e13a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_151a4425b411596c46c7032f6b83d3152a0e0cd4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_153e897098539c3466da9d7a37234daf16476277.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1552dc38d26f6badb7a9bcb5ce9124d54cc45ed3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_155bafb551768855c8c01faa63e44764ebe6c110.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_155c3549d067464d186a99b8205317cc000d4898.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1573e3d855d28c54af612ab950b081302891d56d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_157768cd725813f8111d265cfdfea7f42034e5e9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_157b89d8d625b8244b5cceaa4d3e5fc5a09c8989.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_158d5ce564c3ae1eefb54e3d41dde2604560ef4a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_159ee1f1b44d1a8fbaead65d8449413bb616d15e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15b255dde1a9d915e582ee2a83de7d83190c6a24.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15cf7068183421b141ed5d6e7fe902d06b6492a1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15dc02ea7e0908cf0bd48034f5a49debfaa36219.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15e8e1ab8c63db96843054bb7a98d708ae6a9c44.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15fe3e8f4add16a088fe44458353fa7c0c4f9658.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_16047b5544acef40e39932672cac6f562e200948.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1621507cf219fe608715d4e5bb6e5764022e2d61.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_162b0dfbe3f615b1d164290799b2457437a0044b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_164a947a6c2ba83a5b1cb7074aee0bdac6c9c64e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_165dfb45658df8f1ae8dc0738ac9614740f2576c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_167f5328b035ed59a6f05dfee31edd704c4b07ee.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1687ddf65ce4ed2997583e20fee9f201e86633b3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_16f94f5c65c37624f5458c165daf83517d9e3c81.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_173c44dd85077e6b12dd06fdcf6b11ba349e1866.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_17b9b96edda151072215502cc2b606bf1f6f0b03.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1847fef2c06ea581b0ab31af1cb0556c572696ad.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_187963e1969301abfa61d06afc97faea2bb4efb1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1886d4bf54b3a4a9e093360998b2059b3c03d072.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_188a70d526394e254274df95de0727850820326c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1899e28aff2fb168cdc3af7132dd7fd09c2e1ced.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_18a4d71b31c451a50df7996e3db864bc3c3882ed.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_18b92b4e249195ac3e0c74d246585a4c9e0992fd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_18ed7195a9443c84956c3f32839cb3ab9056bdfc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1914250fce818584291c69a5f058a58cfbd83df9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_193699a5daa14ca2def07489e0b563149bc403f8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_19af6a7f9e5020e8d0f0ca0f6258001f6ce592c1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_19cd9f7b08cec83736605af63d9fcaf463a1aea4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_19df4e13108e043361e9528b71df56f04f696a0c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a11dd5ebb989503a1c182684e7f247e2f8cd9c2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a236be9da05a07d11cd28034d90cdf89941a172.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a5e18f6333ed2cce509f07cb8bd5868951d66a0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a6785392af35e27d6697b584cb6f17a766d3fee.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a6bc2762b95d550485aa720edaf71138d94cd07.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a8da3e6ab050262b659c801ccf9a14787d7f176.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a96f0ac76f117e66eba97cb990c2350561ec2ab.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a98bcbe900f8c141136d18c114b02fffbe8bca1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a99b2625adffa8215276bb88fc65bae944b846b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1acf2f892742b1d236d2b31a8185c6869126adad.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1b3e7c8969027d3316875f33dc50fe022e05ce37.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1be43f8b629e7039f57b95866d5777273377470d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1be746990a2032f0363ad9f9112cc994983f4706.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1bf767e7104cfc8322f26df35907fbf04b8948f3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1c1b0f85e085dd0769c566fb16aafe5ab5952714.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1c2a2d78176e3f0a78e3ad78217e75a4430c0de5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1c65ba6dba01da9caa84ba89453b61d81376763f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1ca3f45d0be2d1119cccd0af042a3e8adeda2ed7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1cbf88db44aa5f884438288a325270d29c7a04b6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1cc459e57bfed5ec7f40ea4a4dd9f72f3ad7a709.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d02609fb803ea2697e2c2cef35e6f923d2578cf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d0b822743e0205f60521d38d7c64f589fdf0f58.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d21263e16dafe79b9fe2f998847296e575c14e7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d3ef3d5ded0dfe2a0bafb52ea8f841658db35fd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d498e418ebbf33bed58b4074d1edf3d9bdd07c5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1da23de9604b5d98fe02529075bad995954c12ca.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1db03461737f1e359f389a8d297476f9b60faabd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1dc6e599144a093203fd7f92ac6d3c2cd7180d49.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1de2f97d49f015b9af0b186801e939c6f357a0c4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1df893ee660d37fba7eaca452ae65b3e45a73087.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e22f2d99804198c61251b4629a3f18ed3dcd42e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e33ce1fa113b221e5303b4093c2c4e748ce8298.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e42736d4f677a59a172bd6f162616a437696351.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e7d7888480b83c78833214b32e10f37a6e20301.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e9130607a2d24cb0662a47e9cf12c6602143838.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e943fcc2e64c618fc1415b3f1a0db4d70aa8494.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1edaf9d4270d2ac61c299320e06ba73f44730364.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f0cad6ad5b172e51c569e84cd54a19b4eb0ed05.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f13a6d0f8c798c0c4ba4ad202d081899fe081ab.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f6bc5faf18be193212217788d476ce6fd384bfb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f7faa0b33a9aada86f032174afd40d18efa7715.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f81f8cce0d77dec9f977b9eeb0778b70a13fa75.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1fcdcb750f382fc7828a9886585f50efbe5be735.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1fd9fa7c2e13d0bad5fddb2b5a316bbc09d397ea.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1fda1c96568eab89a8f6498f8bb23c1223cdc7b0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2005aca3520b171bb82d10ad70fef44f28c19776.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_204a573ce6b7d2f90aede543939315561cc43177.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_20588bcac681a5d69f252d7523a3681a0c6b6181.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2081430c92864c29bb9f409e7c27caee1de00749.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_20d5c3c86398f6ce55abc90db3e362dbf9f457f2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_20f7ea0aabd069362ba4bbd66623cea5b6e1a6bd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_210ef512b7862837f54acbc3b21e135a192647a3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2122c973581930ab7a4ebc90b3bf1cdaa229a87f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_21411df58165946bf02942b597d94de7dd856987.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_216806a4598c885e517e664fc8280c59ec3cbf11.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2173b7c710d418f44dc2b41bec5905024334eae5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2177d95cdf45f6fec95d1812f2ef183a75259e38.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_21828c7d3f5574690f12f841c27f025206e6165b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2184fba2eec5899bb40d49d4508196e6be1ec1b1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_21e235e31d6955393ac8e825bd69ead70687b7c8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_21f860d42fdc2cc6bd743d53ba546e332c22fedf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_22105635385fbfb5d2f330df83ba6747bcb27f6d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_224f9af5e5ca519b21b71a54acb49f50b4999c47.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_22511de2592b6e350737e44865e1fed6496e3f32.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_22632f996eb63fbe4bc5748c5897b775087446a0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_226662cf1c9900a4334d2cadcc5f5ac3ad355f05.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2273457ac3be01cc1595a015a5f598f8290c77e4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_22a07ecf1a59f72ec6bef3e970d7f33cf54c5f44.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_22c142d869ef940ca876c93033ad53b576ed34f2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_23047ea90076e3b0a3eb0586d49b9ee74ca6d279.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_230861e81e5acc523fa680534eed757b7b4a4e1d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_232f61bf31dbb5de5d7039d5ff2338068a759b68.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_233132e712eba8972ba444c604f89e01c5b84cc0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_235bf652702c2976551778b9159e09188575c63c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_236b3eef02b904304348b9d35f715b639d63218f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_238e4c1ca112afec494fbe47a85b553302c43395.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_23914c00690ac5c4f89cdbbaf00732ba66c5c0ef.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_23c9b46da8774462de8c24e14b12df3ed596eb57.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_242013527a0266ad479715ee3e6ae01c45de29d0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_24410fd9a4150c33186a2a365d06d8f6ea621c20.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_245d90000b55ab8b6055b1934880fc6c4870b34b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_24643917fc970c043d1c80d8d4b17ec92deeb8a1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_249668a3212cd00edaae871758be30a5a1fea589.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_249e6b93baae25dff97a0bc9145a8d328ed3f317.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2543da478310245e19e6c6a0d9ed7ad99540b3bc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_256ef175029a43e64164176d4eb212baf9d27bb9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_258d747083272ea657604ac84867ecea17bd65da.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_25938733446b6c0dcd159719f08d04a9aa467967.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_25b3225da1e1842f83592971a1f62a0fe30aa9d3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2660282ad39ef034fecbdb74acedfb48620b7dfd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_26835ba70606c769e56d19dbfe74061361aa855e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2695783ae8f0034692efd6563f789ef03fd0f4f3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_26d77b228420a3ead919474ec9c6fb2800f86890.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_26ea90eb5a527434c1740933a1d2dd863eccf14c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_26f90358e522d7bb7c76c3a2c6010f0f38788bb6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2703018e71d57d3266fc35e2e18a78faa3dd52ce.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_278639d44a4a8372a627a7c31e9527c8faa26f97.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_27c2000d32c230a57a6712f27bc0fba02722f5fd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_280bfced8745fbd9266207463fb41476dc23afff.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_281d897ad17d7f6db2741b396e6b85a9b8f35286.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_285e61dad8f63fb973cb2eb899c959e400622652.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_288458c5a0720ef152848713119ebce6d76db6d6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_289071756e7d0582eb61ce6483fa3c988d2e10b5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_28e4d2c757e4b8c366a2c320360e21ff0ef671a8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_28f1ef32c4384ec26f3dc5e3af6a74fc8cebae92.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_28f2e2b108a53308a0cb6c123c8d318cbc2eadb4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_28f7634d29bef11fd466b452a46b0612f38c949b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_290c484c2a366258941ee0051e139ea716a9de2f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_291a8bdf9d63b112e7fe5fa7e8835a6789cb8ecf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_292454f2d82184ab0491ea0675750c6ec55d659c.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_292b4f995d622826af5d1f2bffa7ba68467c841a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_295a523f815eb822d66162d4feb75fe0bc50b648.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_296c5836ba118969c4ba89ed62a98dffe3105738.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2995d39cd62f20622a31f11a292ed175abb5fdf9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_29bffc159b0bb826ba489ae763dae141bfe8e802.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_29c9e5384809b21f39e78bb2e43af345a9a21d19.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_29fe68ba10b3480dddc9866c51ca8b5efe962cc3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2a3a980a26682d879c3a3425f3ba5be3f5761adf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2a45129fc4995abcb8f880692f11c6186fc01641.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2a833fc01e88bd8e256ef64ae8251dd0ed10720b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2a97c457144cb63a9c6c3d6be613b47bd0df9928.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2ad492377add5c8f6d0d2dbf9ee9e4338bbd9f1f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2ae344010d49f7f9a6caab2cb84be7f87d2d96bf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2af6c5be53732eb1939a2f93232af7dc011dec1a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b0bcb241e5a1be1d35366461408d06e095a26ef.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b3326e055da32cc979892a2fbd0f7b003cb9f98.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b3af90387f1d227119c5dcd4b71362940bbce52.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b4050988e5790a28dbe10b4c20e14f10f6cf85c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b49a9b0801a06dd89c7f7182d7590b515df1592.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b50073f6dfeb7ea77d5dce288a1d2f08f8f6362.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b5317b6cde327a842170ebff20c2b03d81379ff.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b8169ce4b4b9a17ac96fbb232e6a93f22071ab4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b823c3b99e7c8d1cdc39a5dbc7365a383bf9ccb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2ba934408c75da5479cc41f96b98ea7d333635ea.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2bb6da1095bd8669c0e48b5cd808cf0dcefa2674.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c0bda0feaade2b554d648d72f219ac9c389bf09.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c2e75e6f659a500dd3cf2cfd65118f111342119.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c77bd7e89ed832cc31b2995566a49bec6e4cb52.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c7aede7762a524a7a424cc4dc46e43fdedf73a2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c808da5c2514806c2953bb77d5692e5d7c97aa3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c82e3c4e445e1e02f14435e4ca01a90850139a4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c9756060ac0e73dbcfc58a9222a78f0283cd029.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2caba3ab83239e474412fcf89fe0fbef97e51bf1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2cf351fc2c2da4a8e1760a3affc9a5947c6b3bda.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d06f77a4054ca615d96636c0e2eba2a89850142.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d1f2d1e57095f756ddd11e8e9d4f6f253e3ffa3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d23a26e0a59a8323dd97632e610d24624143fbe.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d43460c011b8d5e01ea98c9b8ddce962de59a96.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d446754d7000673779d15d3e73039fd3c10a720.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d7b637e0313cb423b22cd8844cc2997b3ff73e4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d9a04b7f41dd6f0db017157a44790f35c626e2d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d9c659ba43bb907fd4e3e36a50958288bafd1a3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2da2b905c4ce32234c2af62328adae6b1f9217a8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2db33b5442d2e0948762b1f2147a321a9d6907be.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2dfac5a83def98340c8786d55a30a98ad68b9eed.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2e30f50071113dc4ab59468d568ac9deb06b0342.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2e43e401abbfb1b6737e4dc822f68421abbc648a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2e8b4260626beeac76c26dbcee3cba1457b30e99.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2ea394a09c8691a534ad2219bedf73724b6dd5ce.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2eba937ff6d0302ab013db7349d4feb914107f1f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2f0247e301a7b076b6ec8a778c3b47e330638963.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2f32f2d658f1f69840fbad511ce8a3851c859d52.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2f55a23a0f24ff7062a4c286944f25d2db3e20a4.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_30024440e780fdf9ec94deccc85216d8bbb5788a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_303b7b04496e4db7c1ba2436485dc7c8a4c88448.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3076a6de0e2612279e0ed64612f7393856bcc9ac.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_30c8e4d5c761fda50e010da779e8e4730051d403.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_30f0200092b0e18d57a9f5e512d565f1c0229436.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3108502fd29d3a24b32177bcea968121ee809115.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3110540b50e95e99a5cccebe47d9d3a83093c2fb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_311104394c8bef8d4ecff35c1409221e723a5a8a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_311731442b756308c0a869f21b7b8b103aa613e8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31222e158484773d2257f4a31e3dfbdb68336a8e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3163272d25bc2db2ffaa1fea87648b45ee68d408.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_319df310195191895005b30151da8c1afab6c82f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31a968898f0bc6366313e41eddb5e3a3ed12dc98.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31b807c48c472e9b1311a6037cd98e21d6706889.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31c3760f5978baf9780ce4587ae4c768af0e49d1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31c4b866692ba5c3d115482bef4790733863c1fc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3206cc121ce8955ed59ea3b12b858ee2e0cf82f8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_320a6196b662a1d3dc7441a9536d825dc356b95d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_321500dd4c41e4d68834814a48a639f5ca36a2fb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_322a86568f89a5a5a165cfffbae9ca6949f2477e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_32438250078ba2a47345ec4955dafb4e4de78a25.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_32527660fa7aeb9a951a9f2fc3c53989bd141c48.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_325fbcb9e503e68fafea08abf86a4951f440850f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_32652a27e8605cef59c8341813b68e7513be23c5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_327e27892bc57f3dec0da24f94f2a483d6c9321b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_328a311bafd1c153525393b252e4170f8aafb370.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_33099fcfc218ffdf69edb4f2f0e46121bea9fafc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_33746071156e9ad46f403a539dc237e0a44122a7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_33e7c1e5f41a451c7baff54f7238b220f1bdf8a1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3400f0af03743dce328486f8fc805dd30bd6da31.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3408103188e27b3bc55dce0c1716c0b4d32d6494.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_342d29c85070f488a14b1915f948e5fd69019c99.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_344932e2655d7b32704be8de9a63bbd8c3369f02.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_345a939a2491166dc520e9a2b9de7e43671e0c2b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_345ea796c8d97bfe3b7c9663bf15e2e5e7696235.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_34807a8e90bf1cd839f32fd718afa6469c35a4fa.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_349241529745bf138552f49d9a93db418663ad65.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_34c2db98d8e2e690f499f41cfd5afb831b756f54.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3511c54e6a6f9eec378d8b661121066536195d3a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_351425a006aeeff4d69c8570cb6bf1e1427d2c21.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_354121d3bad1d448bd413718fa096f54faa12e95.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_356f83cb96d0313abcdb24955edd4264df72aed7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_357f7e626135cc9176a295f3d1f336a7c3852688.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_358399e756ed5026baf3ab78af17489dc07b9532.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_358d28c958c0a831a615a4811d13279b18db09c4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3642b78913a853a62dbff8b99d9ae3fa458f461d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_366662dccf2f650bcd8123c49006c759cd4c0ef6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_367e58867c46d96c9bbaa96eaaa9f93595c9e099.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_36a0a960541bd8a2dc6741579de685b7c0a5f6d7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_377b70f54cb2778b5ce3df936b477f775eea8b3c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_378759ae25465c32960487375828e23c5f1ac869.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_378bf438642e5d863e31145ada2a0688059aa5d9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_37ad61bf8427a26775969f8a9166fd0bfb7446b4.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_37fe04467e87ec2110f60c7aea0cc9bf2ca07481.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38010c9bf7341588f071f889b7a0b4dcc4e7a14c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_381b29d9888365bff0f109d897b508eebfd8a61f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3824e97d5ecba46e06d5ec1a9456c810d80227a3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38273a2f8e6bbb42ba0b0871b6c95abb34531f33.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38a5ff72f22e0ad040a281e66b1aca0bf3a2aadb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38abcbeaa4d33d3150f2b0238bb62ebbfe960980.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38b94d76503e13c911781169fbc378517332c42e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38bb367362fe2c4849ded728ec5dd00969ce188f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38e12dad9e3bafe177ed3c27c833825813e18fc3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38f8a89468cf9c8606cf12a930db062a83cd0ea0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3937d9dfb68351de2942e32f35e2ca1ce71edfa8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_39422621a00ff79b2f5ec0dafb957c77693537b3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3967a8807c9451b09227c0f685c18aafeb062fd2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3992d5df4ba2e999caf6889a852db4e1ba078e65.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_39d3071347a0c98f3221104036f477aa13bffa4d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3a1dca5feb864e8981387c2d07e62acef1730aa8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3a2280997eb6f1d091094fc54cecf42b7c9c3a2d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3a2643099365d0903c799585f41dc1a525ac9f9e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3a6b9566559ed2b1c85f2bea1c55e72c41dc47bd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3af86f458fb4dfcceb7db3357fbae0dc15142a15.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3afbb5ac9048a962a60f48886728220ae6c2aeaf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3b26eafe76cca8e74e819220b6de1f4279d48e43.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3b4ecb47f9ebe8c2784976c3e9bbe4834b475cf1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3b508b92f7e123b21658f6e17d624ffa87831fee.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3b5b3c218e4a7b459e54080e24c5b730221eac02.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3bb129e6dee6848043dd0e8fa812ae80fec4d014.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3bb3b682eab96e4e173affad75b9d8e73f1dd690.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3be7cea6df8e6dd56194e1172f28943667f1c4ef.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3bed3aaf24c73073c604a3b23bb4b0358b8e3490.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3c1454ffc1418dac641f63671e947d9f550b1f0c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3c38bb80e9880335faaea81985ed5d0e713ecb08.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3c3b7e4b8c1efe59f79a15512716fce2282a79a7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3c64c33870ebc329921cfa3867d58b1857421f65.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3cb0cee09d633b6f70febbba63a1e090522cfb4a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3cce3baac1e3ca03af0c3f4ee4d0158ad1031e9f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3ccf0a9d5a5451da5dbf6075ccea45e4a140550a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3cd7a9ca49c1149d46f6b05b0fefc41ecaeb6ea1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3cf45927b6d931e31e2209685d787efa28eed8ba.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d1cea88a2277b87d405025ba256272a1720f88d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d289100991d4c8c362f64c8f6c4ba395c2f3495.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d3f3eb2f5eb1f3287879604892b1c230df85f1d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d45624dc6e33c477c73a155500b015b6c010de8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d55cb42b0096a8ae338ce100f86e378aa1a04c9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3da8c31f6d5bcaacfa4a21aed4d1d3caecb48922.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3dba3cd44f78c950fe7ceaa5f0629dfc607b30f1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3dff884e176ec7cff86d17c6afe1ddaa4dd6007d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e143d88eaa0d9cfea856b2f3a57d1275a656627.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e2557f206fd81d82a3b9d59113105040beb891f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e562e6c3af28b8478020ce3c3bf73c036001c93.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e61b019e1398a6a3c36143fb84b5ff22c9f4508.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e839660557dee9d5bcda9b56940ce23236c5f6d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3eb2ea922daabbba131b90713e06d8caf5f30662.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3ecf565a5a1c4a09887c67ac3b9a019dca427ac0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3f34433b784d1e405ade3378918641372a30bf6b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3f5e01b4f2ca8ea10898c39d6570bd74e85f46ed.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3f7315955f555768f24585a50d75e216c40f062d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3fad30ff0739ab5dede67a96e859f8c474c245f8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3fcc6893456a559c7d22714116022fc69b372266.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4018b1fcee808b6cccd131418b6ae9e8bf900d8f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4018f690b6322588041bb467beabd8a7bc79a2e0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_40357c5e9739eae136a7abf92bc38d3ac94753f8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4052ca6a3ec02f6559e4bbf1edde42ad2d127c26.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_405e7efa263223148318ae96bd1929b382e994e1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_40aa64439b80ff8dd12498b3e5f6b625da16e285.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_40db688a9189e1c47c300d474df946a248a63303.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4118e3ab290263ed2576feaf22a1944bf2ddcb7a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_415b183c50dd2663dabe3eb8b780913b778c54ab.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4160f6b6d0869740a5a411abd80108f729f810eb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_417b1cb14b67dc82f614831550f7deb0895bd7e4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_419461cdb5687ebbb7bf0be136071d70420c1619.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_41b68458076e6cb129d3ec793e95b91430a0c8a1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_41db3f29d1940e59dadc357c040ea37a6ff208d9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4217a48a1677bd26cd48e512f1fc8830a8a551b8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_428ce4e14cf94b284ffa735fe03d923cc74c9fe0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_429b82a27571ac91e3631cbdb7e0a58155abf962.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_42e2326066c91452335eac05f25a6311376bd9e5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4306c6c37cf472ad262f53941611b5e60072bdf6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4347e039c003489dd528faf5d710e687321a3fd7.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4356b3a2ff49f72b91a6b9c215df285f2798ad47.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4377ac04be3a6cbdbfbe57612a469412812fb5b5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_438e3565f4c720e6c9691b0d33c1392936e2e7ae.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4395d3c96b3f4556b9765fd0a3b5701b2fb10948.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_43e7c78e8f65be35e2753a0ad5123118555c56b2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_43f2156a04b18bab55af60e9357f28d8a4604e8e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4409f2a7deb027e864afdfc9975d3ab93c5dcc9a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4432c5214c4d40c54ca2d02f0d4785c6d6902370.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44462715ed5f192532760d6f4c66ff9d4e20e254.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44564dddf8b492d80be54854abb8d1d831e42679.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_445cd8fa559588f4264ce6192f2de3e3065365ea.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_445e28a8a51cd435130ded2abc9fc606e522c713.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4462b192a64efb60d5484798526278ac7a0fb9fa.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4466b6c6b2ec3acb40ac1cda432efa1e4e62d9d9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44690e48f30657b0fcfa26fb3b9af3ef76e792e3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44c181996532676f2140fd026707135144e9d37b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44cc95831c347212021c0bab7b43acd7daabce42.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44d82b58fdc3e5b7a7c20490ce7f5acce4e6ec79.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_451fbbdc2dcf2ec81efce34673ee6c425cc16ca2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4568af1b2f104664fd05d21ad789aed39ecfa42b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_457eaffbff3c58183a656687010daa2c16cfc26e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_458d708d13577f2b92e6d5adfe952a87e0cf7be5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_459c8fb6028991321b09a990c2188d854d940268.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_459ea3713aef9b916e1b38a882a45012930924d3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_45b9871c220c0065d74bffeed4021d0304a9625c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_45f4363f50af1e7ccd24751d5f5b181bf32c604f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4601680af41c8738089ff377147e0547dcad114d.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_461737a13e24009bf1a5a4b780175043a9f2e33e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4666db0ff7b035e54f2c0e59acedc2131b722a55.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_468a5f057fd5cef2df5f919f5102f47e86901e3b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_474fe2d739eca8c93fdcb2c105d4154cee6ca1c1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_47548aa042c69bb9c59a8bf706b44028aaa41830.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_47f3ced9b5ddb0dfee8ed5e7df8eca0bbe273047.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_47fe73f04cef91cd2a0682e905483968ff80eadb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_481415463f0316ebe25ff2fda47c68cc54db3359.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4824e1f8cda50f80988857611da766685da94494.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48280c91d7cd8712fd533e246a6b0f758834abc9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_482e34930d11ff493007b1613993e01acc1af78d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48300e0aeabe337785d4c7b41796ce65df6cc42a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_483eaea4096c8f5bee16a64860432f0634a253d8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48435e5dd23e49e19dd313f9891ffec800ce74c2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_486f6c7c7655c34b7b9973ff357b0813f0a3fd7c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_487724686efd35731e5335efa949486c93ae26e3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_489e7be0f85656d012a6451b65f6c1d2613b187d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48ae3af78583258c4b13c11a442022e0e058bb85.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48d7d145f96aa8958a9208d0c8887742a8c834fd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48e9e858abf6f77489f3fadc4ee81edacd26705a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4904c5910a2d0595b39a3f87652a9d1ef4fcbe80.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_490a68220a7b621ae9817d7b77f55de239b0a4f3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4911bdd71351610d55916d452495e599960d0a41.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_492fbc418e829f89bcb8d93f8afd2869dd8dfccc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_49d4c005d723cdab9fbc307933c1257d114b539e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_49f5017cc0f5c8c8dc71492e7765cf729c1f225c.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a06b5b153ea6e8b1e20d9aad9d4633333fd98f5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a2e6b05e7e4de2cb23d815f8b2c8adf22131c0c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a4a00bd6ea27ff20a2903d619e1361b5e27672a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a5dbf601de5754c03a03a1a42395dc0766fb8ac.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a9f3da698a6103caf25d785928dd9f814ac27b4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ab5d6e8fbfd92e9f7e47bda5cfbb0d4162a6319.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4afd02981f92fbef6277c1985cc479c12bae9239.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b1eaca3c37a82d19f8dc91f06764170069ca3af.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b2e7f96b095ebfb66ecc7a75752fba2a63e4f37.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b30f472f00bec9da0564ddc40e07112b5f9a117.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b45948f2795293e72530b02669c4f549608ea7f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b4c03c916393d6be7c5181369ebcef949eaa763.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b68e4d00295b294320b94bc777d7d34609127e0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b7393d55600c9892558248f4131fc06a6cf3309.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b74439f42140cdda9bb0f78d995d741212a35f4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b76e5dce9af523422782dd25d8dcf6f25edc68f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4baf664bfdf070362bcc91af77d1bc406f744351.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4bc48576f285325345fa1205e5e7e01787b74f71.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4bd4d46397a3749646b232b306688e52b8c6e584.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4be4a98f150f3f9ab6f03b5fd0968c5454565c9a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4beca56234ff6fb4f23b9b24822887fd9a3d0df9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4bef4d120e71bfcfe61d67aa44d24ceb907c2b9e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4c0c50a1fac82d47dff2357ee3ddbfa0b2c8d487.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4c69d06e3f32e3b6d28d3e54ad764b472741c193.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4c8720923c3452e3aebd7b9c1b4b23f0c35d7e4f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4cabdafad0bf803223ba5e8f474cd59233dc48cb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4cb1861e31df98bdfd731efc3d335055090d83af.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4cd3de43cc1f7588d62a10362f59d113ee818846.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ce03571f1d2779bdeaf0a6a2d617e236d191c11.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ce671f5defd76ca08614a7a1f184c36c0f1e2ab.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4d3b1ae63e127b6e6afe39e354d4995afc5faeaf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4d5f3cf0f78f73df79665c26b20b0805615e1b04.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4d65e58c9f147498ed04dd51fe1393770603a6d3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4d7dc0f356b630179916f8fc2041b7f1402b46df.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4da9e9b7277bc90518ab92860bef2097ba96d982.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4db2e63cfebcf84043f79be0321708cd159c62b9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4dbdd9c3f496a27bde68cf86374999ff2dd53505.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4dc87b7d385e7b092e4706c464217b004fd8a6a4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4dde56efe17f4fd36a11cc959320a5e43f1dc232.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e0a88ccef04e81b8c684b695f7cb4310e448915.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e15e4f16de26068cba30ef12fc29332d45e460e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e47f8fa40332c6ed12d9971e0b539049a871c34.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e760de14b71a41882ec4a2c7362565af36d1a5d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e79dce18e49ffe024fe4cd0693ad3399f5edaee.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e9a933b916285d9580a76df543cfafc88a536cb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ec2075f394acfb14fae7b1ef4304fd9b654ba0d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ed6da5357b67cc28aee4afa9523adaf055c4e32.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ef35d82ceb4af2e07719c16109c6d72eaedce67.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f0aded9d1baec3125ce8e176248cb146ca580fa.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f1e1c969b57659e7e1367ac9ba10ed5ef5b69a9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f44435491aa68acb3217b0e693232c67641a2db.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f4a5d56721bb1a1332a65882132a8c5763932ec.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f6243c6850c0a2d2b7bf1476e12f95f187257b6.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4fa4d21931b9afcbd70b1567995d3eeb6f9308aa.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4fa883a36a76edb276a66c5d779294f170d6d4b7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4fd34faa8b168e2ac7862641229e6146d3e28aee.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4fe530cbf6363a8f08a94728e45e88ecde299e7b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ff20bafbf156fe8fb80bdd84a5d2f3a4a944c1a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_501dcf3213efd214cc2ce8c9ba0027f991d241b4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5052b2318dbb78b1a82ef03666a35a623f44481b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5093976cb7b32a8bd28ce92fc13af00a3e21f737.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_50e59bd079f4d205b613056f975fd2b4e372ab10.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_50e7b11019fc2299d70869253877319b03388244.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_50f887556a3540609649744957651ca667b91774.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_50f915b4d9bd18a3c25a85917392ea4a5e88b349.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_515128c6978449b33ce0c35b02a9e9aaad65ef7a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_522a2a9435103ed405dc1500d31652f1d431a49d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_523e5bf45ec5008aa3aba4773e68a78e122b2fe7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_52688999141a72e61322140db29043ef9f7fbc3d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_526c89b7a04758b4badbf9695b316f877b8bb053.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_528db08068589c6e4c096054d26a2e5be63285b6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_52a89981a05963efcea7ba5c1e967638beeebbbb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_52a8a323414448c50571a334f29bc0a38919b61d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_532a6ffd8a21d3e98342fd401f0247f62ca4e038.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5344427df3ae9392c4fc4c25c232196828e70648.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5382a30dcf702daae19bd6705864bfe36e09502c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_53bd60bd2afee49b30a583c32a45ae9f2076db08.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5403eec1cdd216d5c4a7ba977e2ef92a0d7fcc8b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_540bd57333c6839ccf5cf2e928edb996bc60c371.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_541874a7633e5713720b9d084b6d1c6715a51a17.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54208a6e8c5263e38f9ffcb062564ab61d2785ff.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5435b4651a90e331fcdcf224282457e3dc038a30.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54402a22ceee3b665a3f24edb98b8398c35c6f5a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54548ad36fb92d0963893146c8db20f53cbf0c8f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5467aea26852aa9a9e3dae76b906005ddf6fbae1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_548b347672451e8391388a400d016803f4c4cf8d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54940ce53998becf9bddf56df7d19894a7658168.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_549b6956eaf678f7eb901567d1a515eddbedae5f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54b6e18b10d529eb6b32d7c19c59eaefc7184376.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54ff49018f1c12b9fa31e523ad40b9cc162ba34d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_555ba79201a585bc091ccfc326fd24e851d1eecc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_556cd05288e1666f5c67fb87ad02ce660e4c589c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55b14cf2998a61611d1de2594e926fcdc378999c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55bd9c4f1b7a0621c67f3e964d946ce22fb2fc80.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55bf8444c1c26b91fd490c7216f4d0f8aa0a1f1a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55cda610c235987e13232e828f8d86fa88030560.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55ea83a47c6299fefa4220ed88f7a8e1dd938215.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_566b4782793c6526bfce7362efbf6bf069928b2b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_566e26d4969bc6bbe9b092bedab11cddb3360c0f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_56964a17f902257aca9d08c736516a2c67d9a0e9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_56cc4399c5567a9495f17d54c712cc9e65e57521.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_56de9a7dfb1201b56528740e9d8a07b62710fcaf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_56ffe9e21362afe9c3a407c09d5de186954931a6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5724d91c1fd6290a6cf8d52a3801ac6b921dc7d4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_572e68bd619e118292768f0925ccf92cbfa68415.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5732094f5917e9164ee0f973ac6ec47245a69101.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5789f267d34c9961ced63ad07ffea2c6d2911415.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5854f09511778dd1779a839b0b194896070f69ad.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58679919fcd292a2a69543de0db94e2985c9d364.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58762476c7f2bb05dce92ec22c0acbeb03676746.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_587fc33d02b1932235b8d152e57559060211d591.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58a784fb478ff5b3f1e2da9765a3a777efda92e3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58a7ab44bbd9fbc97c7805860d5f6ac81d6ae468.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58eb2edc7738d8d18ac359691da261ceaaf71788.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5919133d2ed892745013b2fc5d503414cf0a4d83.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5939e6610e41aff8d1ccdb66d9e84d3e48e8d379.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_594929c433b049a8cf949ff476309a8faf5c25fb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_597a0276ec419f18f060a5186e6bb703ae434ac8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_59901147b7188212b8d8feea15831a11425fe4b3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_59beb9cb4e161f9dcff79080149076488d436301.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_59d366421e0b51c90fa53c366d47ed8d51b3a329.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5a05b4e7782bd0e29ca9f6d33fc59d4304136d41.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5a216f777feec4752f5882677b18168225da4b53.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5a29b93cee012c79d4364502f1d90f947c73641d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5a85ae0a16e4b293b549bcb6a3ee52df7fccca32.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5aba1183efe205af38e79a1b2dccea5fa515d02e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5ace1c9b00f160a17355d4583d49c47887ac33c8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5af96b404feac271dac8f4190180754480d3ba80.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5b413bdc825ae863d53dab548f2145dc0de8fd37.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5b55946ff3c15a44b9c741e9f6bbbcb5bd4c8577.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5b7a4ea3bb8905a22ae97a94c354b1cbe38093bb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5ba578c0e7abf1127dd0370f06d7278656c93ab9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5bc803342862aa30e23e5be7d84e611bc571c529.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5be9ed84ad9be1627db7a66af9370679816c0897.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5bead6be6e39ece0e5d44335083336f7f546d2f8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5c36fc744dfb0d985c9113175e76c7ec1c935054.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5c742b9ac6749f189d597ac97d46d35189472c50.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5cd03e29403ad53d6d52e5e81182ea6ff5aff2be.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5cd41b6f578f3c903eb9d58ebfab62eb296044e0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5d707d065ae152450f9def619ddc3dddb9089e88.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5d7ed4c885fb32a0b548186e56d64bab98071d30.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5daedab8931f2eefb649b91e80145cb71b63360c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5de27c4081377f59363c2bf2ea8624217566d2d3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5e0abf4e2b6be3e2c555c2134705b9dcaee617ce.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5e62968de58d9df7d687d671f37d63393f189321.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5e735b12d130ebf849ac5d6752e413ecf3e69fbf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5e840be0741afa4d41fd4789c8300223fdc63ddc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5ea53f7c6370845fa94aa9b395c52fd1900b62de.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5efe77ca5c394a60af0313072cdd132216a52bf3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f20263fd84776f155519b3481be5e2c5b035585.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f3c3bed2b584ea2031debf9f953f5f8f7012171.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f71e663978dbcba859c5114ec675a712e343fd6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f8925f929a5b26f3544ca31938aa75b3c59d34d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f954a393b7b5a7131c13d0c4578443f468a738d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5fa19223cf296d7fd10e15e2571e63c84a80fbb1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5fa7fafd4227918e0c7f0c6ca3b2bd673cd07279.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5fb062527121e627871b3f1b2a94b96c42e51205.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5fc66c5b53f83bf1e023e81e9d51f0285b3ae731.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6018ab272d7306689c7dc5a6d5326efea1471235.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6049c01db99fce654e9351e711b113cf7424550a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_606f5e0b99814b0a82a731de36f28024bc317801.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_60801d21c14796c08377349ec86a6c800af497b7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6082d55544b5280b49b071ea277fb1827193fa2a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_609616f72bf16a060fa50091ac139ddc06bf9d88.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_609f68180582384ba81aae2b1d4a4c52dde2c68c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_60efa9c427dc278c0d1bc31189f683cd45e4d873.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61204f6805d5d830aa6fca2a9b5f238ed63c3a73.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61220f6dca850a5b5ccf1f619a267c40c37efeca.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_614a9f10ebc51bde3f580ef527c17f89489c12c7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_615430cb65d8d540836c7f12b3367abd3c8e63d2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_618031345ea71cc17e458eb97a559b7c94d3ae43.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61896aa9e4e4d7e494c1755b1e77a08e0e264f8d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61a44ac409e914c12281f1d26e5b52d8bfd0df75.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61a9e92183ba87924e73ff0b5e25bd12d6038e69.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_62048a8ae1c0096f3372b0114c15edbe813425fd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6214f820b39a8ba81e547a78ed19a909ac13221c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_621da34ee666903307d3a09b7a032f2a70054759.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_628b28f65f19e7d1b22fb3b85b7cf3d09cd54ebc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_629e0b97b3fece7c12504f4c8f1860d611b57269.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_62ab710e4acc711430745e05e036dd6a4d6bcdca.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_62ba7a5a0f3a714eb5f9f2af20f7bfbc82a30350.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_62eb2f81e73d65fddce7ff43c397da6529317607.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_634d530731c7ade2c7beecfd1bbbca8583032217.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6360621af3f7e1e81a8be48fea8d2750fdecbbf4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6376eb68c550b50b9aea42a7a2cc3bda186b0e40.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_63c411351ec59bdbed2590c599f9eddf7807b371.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_63f121a3c8928c10a2d86b487cd13fa995da670d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_643b3798f11997d33ccb58d90ed6c10d5411b735.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_649336d59a8b35919e593217b6fd4314a04ea359.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64a0ca185449a49fa485892fde6af745ba758167.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64b3488ddf3bb1a4870371882f0a5d267bdfdf73.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64c3c1e3dac623f07c2dc1b934ccb868cafcb38c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64cf03c0aa3f1b2a7b76b4e3418eb5063b982a29.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64fe2db75cb20428856b02cd1cc8d7b393a6ad9c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_65794d9c185b21f59274ac5d4db10a7abc0be968.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_658552954505a2092662071401e135e84956c4c0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_65910c8b7a30acc731948ab58467fdbe4fe32f6d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_661b49505cfecbe4ec3e5c7371de3aaaa85ac9d5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_661ffaf653085dd7f122d603bb3ba4b001e5f3c0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_662767e588220d0dc6137b00cc1d8dcc91e97134.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6649f19deeaea20663bee781af7edced7f7a4fc0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_66968bbf7e210911fcb95ba90c79837230ab1ce3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_66a020f728df204ff51e37d2ddc21afb0aad5e7b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_66be70b088b20fc8de464167c35745461ddab640.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_66f651d3415562206c1049b172261fddba01ea6c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_671828f15eec2a58be23063a1a8132d337cd26de.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6767cce35ab784aa42ebcb75af7305bc38a8721a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6785dcec0197fdbb50124ab06efa627f1a2c0567.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_678a4a8210a972bb2ed89d6ac754fb79438ab2da.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_67fb736c61088b8dd92fe0371f5c98e23bf9077f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_680e81c3700f130df142c9a37a368944ca548721.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_683e8a33fdb7053760c9c135002b0a94facbe015.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_687f4aaafd1a5b9ee85aadc6fab79ad0c27a2ea2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_688aaa193f332ed13e017e78ec07a7c80e45f6c5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6905ba47078abd7a5b6a51eb93b26095517e7f70.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_69214eb450c3b249017480efb8d092b0edad6dc3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6979ef43adffdb62100270a62706fb811963925a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_69cbe8eca7e3510f5caa7f13419cfbefbf031754.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a3f42d5c9ccdd3807e488b00f02bc6ab5d8d99a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a4b6226b355bf35d4d07aaef1828091f03ad2ec.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a66604bb15f97a56847a7c968dbe32d247cbc13.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a7b6781ffff9a42beebb4d73f0d15461ddd4479.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a7eb3d86aa385f9ecffbc5ba10489e56856f918.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a95543aeed81adfb6d847f78212585a36122ae3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6abeb7b50ae6a1fc62535b9a1dabbde6f177a9d0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6af23d1460abfe875e71f7911697c42fef0f41c5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6af4c15a119e805e4407b184625f57966f8833d9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6b0ef67ce0f178aa2863c4909f5bdd7f766c9b2f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6b638314efcc4f16aa4a6e58e6caf2fda1711519.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6bad2ed9f91bc1efd89ea66cd5c775fa140cf931.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6cfb7075345704340ff33dc0ef7c04ef127f26ad.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d07bf9c05e41dcf2416e05dab4bdde17158db76.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d17b92fab5bee7717bf9aff6a6bef7cee3816e7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d307974bdeeef95cca0d130ebb7aeb77fb1b6eb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d40d762ed576832b3a752453e9881b5fe6d2650.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d470f5c6fb81032fcd7974180297d4bb2a8427d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d5aad18f59e47a3fa3278c7ef1a6372830c33d5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6db86621d626722434f2ae9b7b8ab435a8dd8827.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6dd707cf48a17d31abef94215c5720419faa0a39.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6e240106c771ebea461fc2a87b6da68e510aba70.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6e6a4475ea795935f4cbf2dc0ac156a33d754587.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6e7e1d245baabe2f6293e3d85318f9936b333500.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6e8cda718e10824956f0ee39bbb0891eafa45a7b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6eca9cd905ea8b0454cf9564643894682b08cb97.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6eebd0c2fbfc85f938b10535855c388971129a28.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6ef5803b33d97db72eb8a8528aeb3fc956a938cc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6f31b3345893eec8ed1ddf1d8de2512b46ff6187.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6f3d098f8bb63133924aab70d26a6ed64018c13b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6f8788c537cbf6833c58a6ca15c0a36de33c9fbd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6f88527a2cdb5adf51407f4661a254bb32d7de23.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6fa6478cc27e52fd9511fbff38369c921155cfb9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6ff4605d82507fc4bd6e96095eaee5173ea41973.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6ff58a5186d69efd6062f3717bd315394ea6592b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_703246f1f53a988cf252eff88bdf814bd382d3ac.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_70586668a61ab88bc46b763df8f1c2ea52001ea0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_70c8e45f6ea7cf5dba9eeadd0b19481d9f5defb7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_70cf755f1485c065222be4daab84283a9c3d0eb7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_714c5369aa848021e020d874289e3ae4e0f74d77.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7177f939ac3dae8749cbf4232dcf04d2cf63b48f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71a2d046629a4b65c90d0e18d061c4984062f844.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71b6100efe30d836dab557ea4ac54c4b9d35c6aa.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71dcbe9f481c92215f3b636bc0e86ce8f65e6472.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71e3980331dc4bcec6ab6f4c345c7b5f71356979.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71e5fb3544dafa9da03fd2de4bb9bd0718f6009f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7237ce5f3cf13ace3efc0b0227ae5a8c1fdfce1d.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_724d1d4408196d611b2e0535bf8833652acbd6ef.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7264e378e1ea1d4dd97f6949d66f3492883b663e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_72abb25dba0c48b380b2dabeb6ab7efaa706d180.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7309c38fc8a2d5ad6efd449107dc54a7509624fe.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7344f96bed2f56793b1c2583485aa161cdf30379.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7393267865f1c2b0aa1a09a586f54cec98eea4ae.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_73d4901b8ef034590314048de7223a572d61ee0f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_73ec21ed6e040260c4f04ef68ef9307aa86985a7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_741401abfbbbdf0dd1d62df8bc3e85371ead71d6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_743176ecb1f0bc800c870861585edf56f88d7739.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_744ec604c577a27e0aae5b39711a9e2eb82801b6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_745705ae121a1a331527cedfe4d31218a428a0df.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_748a3d76e8ab73af9a5d2302d33e3b1d1b866dd1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7497eca4d1a18306b406b367653622a8d64095bf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_74ba59d347ce8916a22b40e6f22a3c89e13db4d0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_74d5f2aef029f2103bb419cc982cae99fd1a9253.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7524904ac5a2040c7ea72aef5942212f291a21bf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_758b211174da0f398b2a093e7389905b4f9c4060.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7596c14b8fee751d03f42ca48ea4f66e87fc2e2f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7597ce4d2e5264bdeda47487d5bdb55a014c6616.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_75a310a6eb86e3e8baac7a930c3ffbef372942b3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_75c38912947881caa14b3fc7ab7bca317e296dc3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_75f2010bf6c478d2f0eba77e912697661306c1cb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_75f21e38ad01fade35b1db40adabd75eb602410c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7601e6aea44b96e94fb019501be6b102c6e6a654.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_761bde840c0c8149b24a8f6f264e963c4e9e8ceb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_765940baaaa2ae6ade43ef4c94a220eaa63702b0.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_76674fc182dfa6329c73a354aa3adf458429444a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_76704ca28a4877a1e84022e022614709adabb280.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_768c80fd3ea17813df1bf19a158186834fd00780.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_76be322fc072ca19baa82707e260c6eba936ae19.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_76f884e9ca116ee47b446efe9fc770c178a858d5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_770ad1eb1b30ad8f1e7c17df486093129b2d5630.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_77200e875e0ef160b311c7de450c137772312d0d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_772016803aa3ca6ebe785557118365f9be7c4339.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7726be8909f631c04d4395fa4ffd03a736f447f1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7728d5bec7941c9b6d5632bee8d67ed92b9c03ec.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7764814a0de7702f0b7b5ce9dede6440603f4853.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_77a814291d8f01870274149b9d82fb75921d6e20.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_77d0223697ed41c4c2fd8830f8df6e5620db547f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7831ce329f2a0812ebb1dd103ea4ba8cb7ba531d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7838849e57ee9cd292e588f587a8079b57becfc8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_783ec08544591a22f59dc12f169b7327b4185a1a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_784c35fee4d372123631312f1051c43e1fa12378.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_78663faeb0425f45e8a0da0f7b1a5ddbee5e07e7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7872c45ba170f2782c4b5b75cfc78ac79a4cf157.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7878e2a4d3b96a552e03d1ffc33debfd50c9f7f1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_78e1edca5abe1bb3e7aa946eab6484b7bed806a3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_78e945db4afa1330fe3978bc1bc9ae99828ae287.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_78f7e2a2c08cd87702793f91b6935cbe4c22be55.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_797750ac0b18b48f56ceb4640256e9bd3a36621a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7993fc08ac5c6ce7a2eceb1227f4e3718dc4cf5f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_79a7dce707954e765d97cb22e57d9bd6168860d9.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_79d0b8053ddf99a4d4447656d733c2da026b3a7c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_79f182ae021e23869d7bebf2a9b4575bdc910ed0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a0ab620e6d62259a559e329460e46e6e3f7c3f9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a13d62a715fd717f0d4101f787349cb49cbe70f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a242e5953f44316b6a4f6587ec26283ed6cbcae.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a2e032f6500fbc5468183415b6dd1d3e43f0bee.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a890b126da2d8cfbf84f048b779cac2dd56b509.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a902ed4ae3cc6558c73b730ff3949778007a230.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7aa14aa94d625b33df1adfa30ef4d91769592608.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ab03a62e064864e1e9c1cd506c1b2e1786a777c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7adf69b51f0a8cc9ae7e250e60df38758230fe4f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7afd1a756247b15b078d15a39e350a07c22982da.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b2d3680c3578c7292349b58843aef7a82e0087d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b5680f97836be4a369802e8115617a83875703e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b67045d438a7e4b8f3a313a5df5a85f351c1be5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b7fa76609243a8709f349ffc0d9d88157f28dc9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b9a3bf1a9b37e0bd9bae6249609e5994dc0dba1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7bb7b63e8a4c1df4eac4d978e166867195bd6e53.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7c19fc90e5a9c422dbf529d2def286f47dea0f50.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7c23dde1a386436e9864c8fa5f1706c0d2fbfd0d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7c3d8ef4da515960bf40eb1feb04d21950ad5ae5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7c4710e8f4e27fae4ae079f1667c3a1879cb6da8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7cbe4562c51d6829ec5942e11035c452fe318b3a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7cdc419d4248dfdeeab1f0980aec35fa134e52e0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d08373ace7087bdaca4ce8b0bc329f553f88d77.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d0f767c17385eb7d756cbe8ed444d7cef72dea5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d12e9cb599d24631c082e3cf65d2c58b6d4d44f.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d2f87c021e0b6a27b2d7e30351fd50f06414b5f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d5667b27f15a06d4040354fba3601d48bb9c045.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7dac5d4cf103d658e129673549549f1276f134e0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7dd260849b86c46b685955cab54ba07d49b47954.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ddd621da88c57798db1e689b93b692b6519ff96.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7dfe21ee27f8a0ca0407ef0dea73cd73ae6940db.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e1bdde812c332c9fc58613698568a04771b9fa8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e332a6aeecfb12dcf70c69157fd3137343fb9f6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e6129eead18d13a4a6cb9550384fddabc7a2a16.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e89f79217037e361bb0909d06534e40f5026b4f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e9519dd0d0f940fd5efd61bd32df7528ba7e3fc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e9c7feb747241c9c7de2adf3a19933a1c4c0995.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ea9c37d92e344f3cc58cd4d1d00f19167e3623e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ec038393ec329a894aee9bbac078a40f57a4684.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ec04763d635c5bc3e810737b5d948c59f117d5a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ee953cb24e28bcdc8f05783894b23cbf83bdf35.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7f6ccdb3c2d595fffd05bc5e6417b157276547fb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7f80d44e82e601dc48d4c8b4e710ef7265894b6c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7f9403cb91d6aabebf081afae94a8ba397d8d24f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7f9bb3486fee7b7c9e24300b8a4e4ce88a11bfc0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7fa76fc1b066a15b08dc6c24a7cf33a58b4cb6cb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7fe409f4421193fb48a54aa5f26bd6229d23204c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ff65c7abd9b0d8a2df9302d6dc167637b3a72f0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8004763f674dfb3f14b66dfdeb2a046e413ce2cb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8007bf7ae1b71bf8ac4a793aa519ad333aa7a7ba.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8021fa266c77e6b5bd1af2a9c22c686e5a6eac78.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_802b21f9588d72c3c3e3b9a3b269f19c484d5aa4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8046f566fa7188c92568b277354e8b06ad382544.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_806f9ab9baf631df1d3a8d801e4cf93a102526cf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_807545400aa6e70ff49a5f38ed6a218a180bd87f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80987e2d765efc320eaee813607c94c80ee35aa4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80a72d70d80b66c19e85daa00497308381050048.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80bfb0e6032892cc58cef4dd403f305a5b76851b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80cf0997573f4bcfbaaf75e40f519580a7495a17.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80efc341089a50ed5669b3c86f6ddd9b124d1442.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80f51f0e178c33e6196df1d2e47bd38bf5391cc8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80fb694fce7b4c3c459fca43c89c6002fbfdaef5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_810dd4e870ceda3ba9b5f0084a4b025b2e609d57.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_811db756577b61cde9fe8279d956980db9ee21a4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_813e60e8405aca3f7fbed19452ae37574ada9a77.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_815918206483d2ae04a45aa67d69dfb986587214.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_816c48e129a0235cb3a19124ddb28cce286fb368.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_81acf1d17650712b71a499bb66909bfcfcb6aecb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_81bb8f13b6f20a72c9ce6d0b53f81eddbf05f1c6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_81dd3ea61bb61de02667b14f5a94198f48c7307b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_81f6c575c3fa2ccc7e65022f1ba65c8cfc16541e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82048cf91270631f98ac37dc488a1fb2e00ce004.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8250f27341241086515d833aa53ae873d4ece3fa.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8278845045d68027dcf3bf867ecde2fb12ec51d3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82ad0c0580516485ea432d98f53e73f6dfec548c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82c932e6eaaf44861c794539d9caf8b50192fc44.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82d7f61e6313930f063758b61102e7a43b118beb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82f0f3d71108dcc49234a258f0f3b21ea2123cc0.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82f1d7e1a93bf2fa80c409e6827ea88af56c44f0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8301bfc0394936a68fa0098580f06e77c88ebed9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83080406598df6bd3102db70a554e496e29db96a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_830e3532f27b391585d5de90f3bdf97992b67651.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8352031044ef2e4a22e27ad04ab5d2c02121faee.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_835a906031a258c6362313eec783678bd8125c91.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_836a308c2d2afd6e0dfbfda61984b631c4ccffc6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83d580a612af85533c87aecdd7b0345c71b75980.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83d920a76114c63156740ba5dd6f3846c4b21c28.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83ddca2c6ecbba4314c434e7471ffb8fa642f936.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83f6a1837a65df12b7c55d25ca28cc939c2a6328.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_843e7888cba5f463d19fcb71aaaab25dc3d2c09d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8441910c34830ad2459fb85c2c14af02da718fdc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8457ea5726149efb8778e6d90798b8e48288fc9a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_847feaf237911478173377a501ee19ee325b012b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_84cca7528c7d1bf49ba79625733ff0ae7522c096.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_84dc4af43de08130a04bfa06df9799b6e9e96900.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_84e8ae99e184013739019c93d07caddce532382b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_84fc5e94f89d6a9287cf64662a372784511468dd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8513d96a66a4d9fb8dfc84afba7e1d8c200248a6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_85156f2c556c6ef6180608c361b7b35ede71ffea.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_854c8003a508ed3f8cbe6967c4ae2635a491c721.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_85908fe6dc9c629c82d6953081b10021e64583b1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_85960fe542635079de5eca3c7785890cd4740005.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_85fdde4b25e2fc8cbdd46c2850c19eac8d9af8f6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_86309c036d96367939ccc3e8922595ac35a3e179.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_86513d6e065a44bcb0c789eed1e7e5456e800ab6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_865eb90b1a2d64acc0f6fbe1d807c501fd4be3cd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8689126a7eb09d81baaf8f99dbff8932fbeab3cb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_86d73393d0d8b769f30222f7817563a955c36dfc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_86fa51b8c7a2f3fac5cf4cd2951ed2ede5c35450.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_875b08ca602fe48840c72cd61798acb98540fcd6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_876a418fbe6183d0392b7a7d9986d067e323e2b9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_877e33463b3bf1853c6d2d2009af8d27bf88abbe.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8793dc3217e154b65ebba065aa10ab4dc2374ae8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_87e3a06266deda093bdf28af82d8666066157fc6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8840e8899b4e632714632450bcef001c6070f955.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88ac7f6cbdfca2e397bcb86af4216e87166601c7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88c04463f9c5ce565a9daa8c22e16de80fadd707.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88d52c5f70abb525b9c8aa8fc1cb3997c33ed67c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88ea5b5346c87cc4fc1e841c518080df4ab811a2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88ed7f650c958a644c8031aeb88688b1e42458e5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_890aa875ac13957f00b30210477924697abf0c9e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_89617bdea526d12d6a33ed42b9b0018c0b173722.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_89a3327da9a3411ff1cddc67eb647083cd947a92.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8a1fd28acfe85b3adac859c4bbffa4d28fe634fe.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8a58d4bca33c4c0e79141a56688049237d170d1b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8a824621a50cdc3cbadc4b1f9ef18e1325385082.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8a980749c6b2a18c80426dd189e5506334343ca4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8adbdcd28cb2f078f89adf9aad2b3d4a0a477823.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8b17c082f249649eca733a8f0cdf9a1205c3e3d7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8b9043572cabb65435627a3faf23b18d039bbcd8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8b92990df507e82f96eeb7aa3ec00c01437566fb.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8bd1a40b12ce927323594fcce61eb9c20cc5e3d4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8bd7b8c63a51c8639b3cf27ad09d41ae47c480d3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c074afcf33e3f3534ac3577484237fcfd2ca48e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c13c4f3f645a2bb475eb1c55ce1de452f0e2332.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c3bd4e029bba76ebfc79e6522dbc8ca0bba5dd2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c4688cbd23727dd0ea9a36fb977b31aeae98d65.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c7970957024de050748d3e31cef434f582d968b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8cdcdeb845e7bcdb89ef70ab2a97157d4db3cb52.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8cf1007430da272174d3476d042f398627e83512.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8d079c1eb36db8461fa8b861c56760afcd97cc34.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8d7549e66ef309e32779ddc2a1f14e79bae53754.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8d79fe8a600c3b4e0ec9aa510f8036ba2b608985.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8da8285bd6182355e3164cdc5a983375cdf0a61d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e1b48a28b71c7f4c78eb14321b39951a7c5e903.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e2c587db8bd9f1b551624e0cf8b67a90245d7da.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e2d5f979fc4fbd0991581a020a414f9c8656ae2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e431313fe082958d31b68d2fd0d61df0fe56736.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e50ea8dd480012cbe10be392cd26d1870e6ef9b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e675919a6c7758cbbeecb83b7ac6c62f95cdb46.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e812705ae3e452810794fa7caceef2ef6066dfb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e816fcad5e9ecfca94a6491eb2274bcc41e558b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e938d0e3ad30db201880642e57758285b2ec4cb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8efb5fc2ace6839eac741c5e6616665845f43566.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8f607ee20c0d92b6dbd0338f139517fdcce98d0c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8f6e463eedd3e65b9c79feed3cd92ad8cbc9f036.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8f7166d4bb0c1c9b9999ba16a1adbf09ebfdb6f1.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8fa4c40e244b412a07933d369704bcdaa6d5e74c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8fb224b40a7be7db0a9c5c08cc5ab05b526c14e8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8fb33fc20f2e85e915f1b1529ae87981dfcaf86d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8fc08b4f3959a2375ac03f40c4ce12d70cdc2d80.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9009b7d39346537aa6c4a4e46b81139f603edb60.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_900d7f81c73b35ea64095d01c5d48d9190839e0a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9068ba8df8b0e977e9769f6acf6cfee6b00b9922.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_906fa8bf5e992ddc25815486ae9c24d8bfba7227.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_90b17d8cba28cceddb3ef907df878aeef0762d15.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_90da0d469cca5c8481504148468460c85a15c559.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_90e5c56e92712d00092ba102a5eb5176a3e5d471.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_910cb8bd09d287a1566265eb1e8894fe68d3cc81.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_915b75db795dbef037b14b003ee073665fe35d3e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9163ae070075f26926a86d39e15c27e6edb1f1cf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_91695dea4171747fb3cc6d910459f800608d07c1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_919ae177b7a793fa352c4f6bb8e4175f3064d814.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_91a6200e36944b1f11106c02f7fcee053f01ee71.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_91b9e2616c2fe0480096b1ccf0f74d584b220146.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_91c916e14198f6d18dc89915e379b01070434e91.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9207a63fc55c411c73e4f93306c5ffed800dd249.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92121fd448b4640a17e1a7fe73bb7b58714c0afb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_921f789d619db6f225e8e9d646e93bbc9dc1a669.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92739f4464512feee083b875e11e11eee4f5b448.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92992be6252f2afdc368bd4baec4b8a55ae0abf8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92b0770fe64e3c60b9e56170aa88bbf74802a813.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92b722cdabcfaa388ccc6ccceb7e42462f3bdcd1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92ba64cdf615c1be2865f027a293cb530fc07dc6.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92d841e6d783bb46d841aafd9027f92dd1b61b88.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92e53359c69bbe4d7405d45261a8a62008eb7d06.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92f9ad0fb65638cfffb3e7786f2cbf01d9585b23.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_93054acb8a9508fd0f0f486367fb62454de47c39.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_931cf8d05cfa45319f4e5bb49334d35a530bffcf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_93728d999ae43ee1b5a16e60b90cf8533c7d303f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_937801fbb43fb6797f0425f08d13926b74d87c4a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_937c48d0b7096ad6c8bc445f13f2c8c1934695ab.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_93b885d6869400b0dc2ef1b2c2636ddfd21cde31.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_942439e4f5644a3a4630481bc7d98834b29b6e1c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_94a94d145e575747c8956ac703810582c819e2e8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_94aa519eb57e5797125728492d9330f5c0f0670a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_94f6f9dee9f0c3825d91f4d320a5280070e60ee7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_95061acc6650fc7b79fa1fe5b2b1e083555eec2c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_951343832a5bfd060c8d12da0d8a090f070a717d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9545f95c1093c60f0fb6c794636f79aaeb53b733.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_95530399ad7b43d8ce2c89da24c71056f2146b18.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9583148fd684a7e6a312127e023798278415bd27.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9594816877815bc0294610ca24f986fdccdc7c6f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_960ecb3013071fb65f2d5ed4c947c4bf303e5308.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9638c9618dbf2af119e37596f7eb0fd3f8d72748.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_963986150adcd6e1d3886bacf2166de1252e14df.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_964f916d3484295b5918e2e4c22c5529588a5662.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9689ecd7bf51bcffe9f5002959bdda41c50a3c8b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_968fc75a7d102aca068e3ceb6111728c280fa837.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96c129dd4c798343d6f78ab78056f0faf2f1c9d3.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96c5e79f54b71677124f555b0ae4bfd27248d099.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96caa2056d99eb67ada498e287b4fae984397691.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96dee49ec6755006d67f0c30c65f50558bba69b0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96f1bb85dff8c97846f6b2e8796a6289bcd0d9d3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_970073c70133ff2ee4737f803a0ac43801c47242.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_971a08c2e48d805b295d979b24173a04cf58def0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_97246460c21bc66c0f13936d27477a9fca1c44d1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9745b04a8026a01828c5dd606d89d044d3ed1d99.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_976cf509d9c2bf86ba6ee5ded544fa8e6717f590.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_977137b371df841993c8d0584be7d83aca6add78.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_97851d5ecbf02f8af623988b1a39c0b91e51533a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9801b25e0f132d647934deb395b62a3f70cc7c88.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_987a617fae00fa90a1ba60937b0312c81087c19e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_987f00dd759d9714693e7517dfaa8bb427294d42.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9893336a4b00b2a63f23ed7e13ec54c82d9e5063.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_98e484adeddf3394d8d7693b808d83b64c71ee69.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_98f5efcd500ce6b9ffc14bc9877e0ba457539925.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_98f9a4f4d85f292b78123599a2e1798f12aa545b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9990e6ad243a48b84304b5cad0c663c0802aedfd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_99ae680eed89ea93a3a94586bd5a68dbc5439f37.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_99e2f290b962f1617b0a9d4fd6d55c43e4439d6f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_99f8352674bd6bbe98944a1c0a769a4fc028a623.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9a0a70932bd587759df1e5e150b25b0126d7b529.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9a20fa19d8d30654602e363806f559113218d66d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9a8e04fe9432a60f86ff0369e8c1851821074a04.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9a9edbe35a8fac7796f00bde836bd547044770ea.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9ab73ea77ec20ea3bfaf995dacf93a6960ecdca0.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9ad1f99284aafc8d7908d062f179a056eb314925.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9ae866c7db36286876818bfb718ac35204fa3843.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9afe4b6f3b901ff4af81bd4f1cd8ff19f09d0b07.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b062dd633645772e4f2caffd111af73184f7657.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b327f0fa1155f2235d76be45cd22e3db5a69429.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b4dcde1ae3446b825dea739d4295c1d1ec5c4be.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b6d08e63b9a90f2524cbfa8c5fcf8b82a1d2d36.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b73c92a13757877f34bd8a13c6fb29b60999020.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b841b7cf5da31f0c30ec42c91cc8d5bd3fedd03.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9bcc791049e3ff9ebc1a9085d2d20efcc2f99b71.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9bf235679af1ca03a6e601b4cf6cd0416d1c9091.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9c4fc7cda4b560040cec93f63021b529aa1ee3fd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9ca3b1d36d777213eb381b47871bf15dd163c994.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9cc3ef3d3b36f52089548e9dce522b0448e2c26a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9d3d274058bc0a3d4d35d90669587761fdfbdba1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9d6759d8855c4c6289f1f241a1628cf0406c1b64.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9d69d441f48f9ea346dd8e00376a9a708da3ad87.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9dc424f0e192155e3c4e786e5b87d5a1a3e6c4ad.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9e51083e13aa4dfa8c969f8f916835a8e5e9ca39.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9eef1b54d5d3841f3fa6b84cca6c7ad33efa2d9f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9f0517550c7a23882b95de451e8099ea2186b4ce.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9fb389d4b5ba590baa951f17da06f0e53d2bfa55.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a017be7b8bcf303b30a147f41346898acc5fab7d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a02a71fdd587e47ee68e0cc76c3c4494ce06c359.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a02f152e9184af0b3d77082d8bdf519dbbfceb2d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a046e888e3836b0bd3c49fec8e1872e880798f0c.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a0874fc5ac87a1ec487c7722bf3b1bdaa924ee09.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a094599fb5caf5e7aba728cd4713a8d0c6368a46.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a0a556c9358ddd6db719458c81d2d6d822a895da.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a103cd47156a98ad2cf2c325ea00df3f1d67fb72.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a189292c81a18d21a2921ce6740f81ebf4c046ad.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a1c71e7d33f0597fe090a3524e33e18b2e562680.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a1cba1509c413c870c5d784410855ee1bd737da2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a1d6ad9de7ac7993ae1923a2ef070b7dacb8c563.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a20c91b2f11bb7e5058ca7935b0bda4f5558a9dc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a21f3637624762547af1292e1b85e640b1d329dc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a225c4f1f3c7b271957768bb9235131c67afb48a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a2482a64659c838f3da55f56e3cbbee1dbfe6722.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a25e2aed617e1ff31f93ae7e054313ee0dceee97.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a2a715b7e9c1a576f011dfe5769c5b392e984f82.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a2ef5d30a2318ae06430d17f84878800c4ca7364.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3339150d8bf9d073827738527f6cbe15b854607.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3709e4fc53d2254a03ea7660b8c72d2f47cf1ad.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a388a284f45f711d82a6ed87036d87cef1872eb1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3ac4f93722dc314086f1b7d7b8adc687cd75f82.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3d7aa46528ee74e2bef1e87c1feceacfa55e173.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3dc780b17152f696f9b957432c2eae8fb16e85e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3f9c236d24b30bc9c3fad90cfd6eb00da835de2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3ff8445ba691807caadd9f26e7eb90851875280.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a421c2ed6b295c458071f1988b9d6f7b46e8992c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a4700d87a19a173e84d64e43cffabbed52366e35.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a487f617c4b84c6a0328fedac750d41dc3dafe27.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a48843d844f78690c7a45b730652f0f763c595c7.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a4980becb0d3149fee575bad1fc3b463d08aabf5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a4b7f10440331a8a88ff93ba253217c2832bcf9e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a55b47aafc4340e69e300ac61a7601a5c14513b7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a55c7dd576e5b1061c059e5e99aeedf4389e2d25.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a59423c095db052603d77073d409534bceef425f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5a7833f4597bb03a3e845d5580d677e97421040.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5bdc110955c05c6c6ea236a6f60266a4a6dce5e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5c0109313de1f6245d2a80f8539485b849e9d55.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5c4dc0d70c547dbbfb661e879ba7f9adfafc2ea.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5d4eb673bafd81e3a0ee213da4603d88b8460ec.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5e5cae764142683b70d3344cf07dd1edb7d69e2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5f2f0cef657ae5e333d65ae4ab20529a43cd7de.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5f8b7b2a891aa9f2ab49762eb31d835efdf18b6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5fa94bb32a80e81886b711ebfcf2df5f5405866.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a622fa57764ec746e02f6d4bd4846b48c722b807.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a62a2ab489839ea1a1bfd1b24e54a3c232ed934f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a6461d72fb6ba50e81de3f661528c96dcfdc3f3c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a64b4cf3f6706e4b4e0af4402e2263b9a1585f9b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a65c43b870705c780d734f9ef063f55cf8b3b52d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a673f35edd69241c6b921d6712dfd064d78ecbad.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a71305f191f06cd53b7563971c706e8b71b19e2f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a74b0e7dd816ad08eec5a1bba6e227afee9813ec.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a7784b03ad757d51c234fa86ea9891f055ecd5c1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a78fecb9725ceb4bcf2aa037d43bc43efeb1c3fd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a7f7553a7d2f6d42fe695cdc64423c85223af440.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a821661d8280c6e9d27f2c9ce1b3c855387b5a76.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a85d35b2fd98742427930eb536e346ffb005edd8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a8a4af070ee46d802cb11086b93daf91538f8a04.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a8a744edfa3a19d1493611df5bd0d4d59b707d43.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a92b43d374642df991edef1f6036dc898bf77cf8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a93324ccf11b273ed20fd960c61df897c8890b1d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a93a03b33305b33055273711ab31a5b8d8298d5d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a968df29f5ae1463706b7981b3bde55918e1aa65.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a98925d99dc484da41dd55700e151cf545cf821d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a9b50c6ebb27986ce5b378d8c39315eb9cb91dea.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a9d2be18e2d53a5144f97dfdebb225fcb6d611d3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a9df9ac4ee78e5f4d5bd0567e58a7090907c61e1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a9f00f270680de81df7737e848e0408cb070e68b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa1041530f794c7b8dc4a8321ea0fcdd338fff35.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa522b43c5e5ea69bcabb4c0fe28def2bd081a12.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa6d13b09f85ee62bb5018608812181fb43afc86.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa82d20635e592edbf00439294835f6f39ad54a3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa996b9c843200a2ec33ed4319b48106cd7c6384.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aafe891dad43815e635f81225705ff944f990d75.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab09941bddfa9d61985b55f9b6bf0edec9bb89f6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab0be5a2072b5e87f5ee58149688796b6513219f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab0c3fe9529e24327686070731d0ac3ada76245e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab1ca4ce061f7f69a250356f613cab00d1e2ac71.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab1d7f93427095e39bfc1d986b3d7fe54073ec75.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab43f4a56c166dad0113f51b337a083f4df7cdb6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab56e886d53a1d88fada0f10f00b9f398dc54568.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab6cd5c9242f8278c8f3d9ce57b97d605c7e5a3e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab877ae2a1aab04498bf2b26b3fe99d6488ef151.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_abf6c6412f9853855b74a96e862935ddef66f763.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_abf92a5314fd33491b5eb6ebd2418b7e0d5db774.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ac1ccde31b47e0e56ee0daab6403fed7895208c7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ac5e9aee85cd16903bf7b82a4ac10402b0b26e22.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ac9382cf8bb56ffd962c99329bf67da992f8810d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aceb0641213e9a45ba48bcf72bb23845720d8b79.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ad091c69d19b27f7ad50ef6311532ad8b642a9c6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ad82071cc074fd30437f6158b5eb2c6df1f8c587.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ad989d2ce769f20e175fa88f4082c1c25fe03062.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ad9b99a194b59d3149842c15733394da275b12c0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ada016be2bd0e377fbe01fa7adb9bbb8febce100.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_adae2d4f8b2dac799e03ea6f279e6ecdf66f5381.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_adaef10ff2c5d89530310bdf1d53a194f06a94ef.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_add29e3e9828911a117dccaa5650e77805730d14.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_adda7ad787524e3e47dcc1b65c41b2faea38f55f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_addb6a14043c5a4df0f5042b3770b40c4e90795c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_adf160741a4f751d2f15d6eb23d4121cdca62b55.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae1ab1f4bbe86bb9bbc22e4774648076c321136f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae1afeb6cfdf860ff08e4c2f11c922fd5bfa621a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae239476d61f48379754b97f29d7a285cc3192de.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae4e7253ad4873576052ec0a9400597bb7975753.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae4e80cb185759dd9b3eb3c67c239964b3694caa.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae51b30c7e1cd30e550187458350c8db7c59a9ef.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae7899b1ef159ecbf01f27014601eb79b31b49b3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae87b1d5c50606430b544ed650d87df24366e7d5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae8d0bdde763e617beafc0365ec4a3cd11df6c55.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aebb2441e6cc1ccba4a391566e547402bcf7ced2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aebd5fed34ebceb879ae3dffaf58c7c04ab5fe80.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aebff7e6605b273bad844b8f70ef031625bff48e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aec87e65afa93e84d7a947c52f291c1c7360033c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aece14f7a220222eb4ce6783ec2b9fce6fde94b8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_af06c0dae15684f83e15722a4c07342af9ea011c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_af6ccfa11add1ae49888337e84d9c446d2f67da4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afadc4f76e237514db0bc0203102297b79730bd0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afc4b47a6fa62a4ca5cff6a7e01c9f6b371d2215.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afcafd07c1f56e74373ccf37db35976023456d50.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afccf699f593c828e11efc053b144044e45b32d6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afda8f46b5ded4c2aa9d722fec17b75004b59f7d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afdab954fd111ec48721f25710d61c0c8affd8db.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b00e062055933388e37525df5766f3c14cd3538a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b01dc872c24db4db0c9179fc07e17f41060390de.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b03ab68e33844f97aa58d463e00037bc11c50da0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b04f14f829eff73afaa57a875f74ebd1e6860979.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b0544a38dfdf4d81dc95894387845f48435e299a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b0dd965d5d9080ed5c6a04b7eea9890f3a264f20.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b0f555b74ed36f1bef8f47880b3edc6760f27788.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b1766695dbb790bd614b83dc7569ad449404cc89.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b18a615e66d7cd739ce35412811359a03cb23a8e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b192c55f002d8540d5f965cc4df0c2e33f4b9ff9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b19f05f6848403480ba41d37cdbf44ccca1b1f8d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b1ad101ce91348266d3885afdf2996a0fdb72135.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b1c5d55d47d6038e9162d32ac968ff58c0942938.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b20c6252863a73341b0010191fad4c834860f884.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b20e314642cf565e4f32bceffdb5c0e653ab627b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b24f91dec2029b25d0d96962528410df55a468ed.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b285e2f1970b78e18002464eeda63798229bbc3a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b298e213f927b518c693660110f08bdd94990ef0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b2af5f5b5ee3ae964824a3e9c7bbeb5bb39c557c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b2f91e937b427ecc932c0cb0c90b2c2378db0be6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b3063d06723ac70c5f8802ab49c5c35e1debf56e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b31f56244076c501cb09b4b90975132cae4c4386.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b3486244e0b7d6dbcaa1951e8b8883ce441c3f99.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b34c1ce348c3d9cdf6bbec9758de9d5fe94c43fc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b38a1d3cffae01332a3a9d9472ff1b2c443e82af.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b3a104733f678193068d8642d6560faa03897258.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b3da22d3482738a8474ae15e8e5fca9020c4e195.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b41735d250b5a16967281a5f07873b9cde3df4d6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b41a30092e8138877c1f6c25656e0f8ae2c2444e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b41ea5293bc1c56efa2c4b5681d965aa6f2ce6c3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4588379eaa268d79fe8f8e4457b009f204a5fb7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b493c99888d82cd2852bfb101f99a2e6a27665b8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4a5715b550f67b8870ba66e1e6282a26cc1dbf3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4b037a2e262d11d3ed7d9feeb41b9e05427a739.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4bd2d206ceb237ed2c51f58abb5cbf96e39d07b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4ec377c44ac18527ca6a01bc3b146706a6e1e09.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4f12f10d7b968e0d8e7c23f36d3a360de74a905.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b50e6df20a2426abd3d2ff2262a37c009196024c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b513834918d5ea789e2db21abece7c2d3532a7e7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5248f443a12d96815c04409a00102923c717023.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5371415448fffffd58bf014dac9f4876153657b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5ac596c636df55e81293228cbc53dcbb3024e5a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5ba2e73df35f6e0f7317303823fde92a42b1a35.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5bccc85f74f54a2ceb17fe3040b04fe306c53f9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5c3131fb8e5a25bd4a14bc9075eb6fa01b61d02.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5c7fca1f76a31b0390e92d90d569fab94d4f783.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5db3d5b1d8af89381fc4b8073f84c5fa25fdef5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b60a4e87a7aabfe3c1ce02b408522f3ec862e3d7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b6b17ae67adee9e56a022cd2a5514fb9c4e99920.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b72a804bb3c99830653d41ac0bd49943c801b89a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b737410b404a51043fc3bd503c0b107c297e4c9f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b75843bb13058ffe29251e053800c509c7590544.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b774450ebadaacf23e944aaf8ca90eada01e8a5a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b779cc0b0380e1e6a2b51fc6216fdd72215b882b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b7a03ab0b7887cc7ed0cb40e56360a8d36c0bb8e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b80d0828ba6d24ea3c1a97bd9835ee937b4b32fb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b872f9e6ebe330cc1818ea82b53acec79a2f672c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b8fbc6f6e9c515edce3c7a438b3bc308b30d3857.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9385db12001110c42eff6aabad935a69ad3afe2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9559dd36a0a4f5e068a722e285f485137bd5ef0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9627f9c8d0088df0364a64643f2b5dcd951f2bb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9a742ceeb6736a2c8f9439d0b05e10d3e0c5c6f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9baf70220079e6d4e87eb01a7259923d8a01e29.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9d00ab8373747a5c6b9d2f8dd50ceb14db4163c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9ed0a64deb55616646ea98b21a891c971cd98ad.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ba145535e53899fe127987aa854f81234a9c51c4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ba8b09f0aaa40a7c9ad5f0458b460d3e328f3c74.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bafbef3f13d429ec3e9f4672218998d5669d79f2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bb111b7acc269f8d5e70915d3efde4c425aa5f5c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bb28a4e95723e3df380f98b5ac107c4df353850b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bb35c86443cc9ea38c06ebc0656306483c95ef67.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bba10ecb79ede07324e1198a71a95ff26e9eb235.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bbe23201fbebed25781f249e5c77c31e0e7f9ddb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bbfd025488e52b97c04995c4c5faff371b77e4d6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc1ae1dddb8cc5d78196da6b26ebe66c1ce7e567.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc238fd2095b26a167b41cdec8280182330b7b25.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc4425e30a0b17e8b31726817e8d3177b5c51934.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc4e0f0496a34d2fb43c80ce0162ad4183f29064.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc6ce17223d8d83a64b8c96ac88223e4441a4692.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc744db85d4237ee9640f1658e0caab7648e3bb6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc79e255d25744725e2a9db9f90d5cc2b8a0e0c1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc897852a4ca992961843144f4ec4f8b86dd5e9d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bcb6f0730fd09b4c6c60913425927dfdb8f83d82.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bcd7ccdceb7baf3b986f2a0248827822a5f72e47.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bcf8836c8cf932cc2748e313885003f0e11a887f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd064e302ff5b983dbdb4ccf51383fb29ddff44f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd28203f47b6a48e9b66302cf8312f3796ca500c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd37f4f7914805a97d5073f1ebf8a8b8c2648d31.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd3daa5f99b4522d932334924347353ce2854821.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd6aa39d0ae3c87d011610cdb5e2e317f337c454.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd80a1774d8b7d8bee4e8663392b97cda11dcbf5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd8bf7c572c1984ca3061062cf3c31d993f6762d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd9c47f3305e47db6ab6bc627fb3d80269633074.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bdab172627718278a71a93e3737ef08ad9259a4f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bde24a8dbe6add6f2dd2beb48b1280f3a84a9b2a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_be1e1533fc37b41838bd37edc2b6d2f2e76ae1c6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_be4dd90ccb2f258029d0156cf23f940b694cf08d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_be8ec1163a01b9cd9a802d8b44669e8770c20234.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_beae876d6da465687f162136231f15767cc7bb14.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_beb9afccc15de7dfcb2e7d898abc0d61201de73e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bec30e7107c5dce3fe6aa87d83ed96da75478da0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bec9e4c0317e8d351f60258ed6611fbf365c4024.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_becc2a4d7ac045365300bf8bd45fc6d3e1e1c8b1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bed5a8c5cf683f6dfaefad72c2e2f5c2f2b2732f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bef3bd014a918feddadc98eed92a7734f9bcd890.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bf9cdf86a7944cd690b0fcbbaec235863acd10bb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c0338fbc05f86270ded7df2bd3e2758a03961b62.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c0342686e4efd26413c6719782ed13603479c4e0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c063318cb851ccaa923be12d34c84d839bc64bb8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c08095341ca7e3a1debeb780c1878e351692bee2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c0a3c4ac0a50bb9b7ad764929dbee98c856b1210.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c0f76aff077c28f8afd7b22f284cf2894e08a043.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c112c01d201c366bdd7acccf2e1b18b00f671153.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c11d68fe766fc753c657362673704005b538660b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c137c03bf161b2ec6a9a046fa49d7bbf80ae47b8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c197d1f050f42d82e6851fa286db6f81ba197f40.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c1b76bc7a17f573c0d52c07ae9ff4302662ae61f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c1b94e19d762ddc33cc4e94c6675d93cbde21e3d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c1f40c3421b9ad8cf43940530ec50bcf620058f2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c1f721a330b2d0fac13b22061616d7b10c0f91e9.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c250ea59ab6e1ee39cce15cbd3f181047cdee31a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2541b6b5cf27de3f45f60671d36602f07ce1783.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c27b3026f1dc3056dee3a3e64bf31c45683607c9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c28de8f96c8315877031a2d56261e95fee6aef44.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c29110dd501853e87ebc122dd1971b0bb1bcd92f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2940fd05efd52bdf8a3f9aa4b78bde9b5809b34.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2a2856bf9a81544a30d535a13554e3a8107c476.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2b719893a4d8a1e71857966d399f06c0a41749c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2f04447e6a94c94a2315454e71d7d607a9fd0f8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2fcced07cc194a8050bc7b2f791453b3f5b2064.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c323a4d1f24d59bddd20ed2f2fb6446627b0ae8b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c355189ade9b1a8269230232db754a3881b53168.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c35ea54eb6cd0f3756c462c66d9be956279b46ad.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c363ee1b087f6b504a3dd3972b96e77db02b0582.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c3cfaf0d53869c373f6d0ec821b008dbb819141a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c3d0eaf9399c863d672e8c08d123739bab837d4b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4015f0d0a7a5173810f6f17c00065e03fc61a89.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c402e84359b2037a29efd1d6ce7213ba7605ab25.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c41b6eda4f250da059fe0c428428219ff5a250ef.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c42ab428503e8f8bfa78c8cb8d9afad9f5185118.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4376ac8d82db1bc25fa273a80dfbf8b71ee5e2b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c45a5e40f6a66bc5292a56e0097c69fe37cedfb3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c487a1a9933239270f44b1e08e1cf5323521c089.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4997f79435cf64add10506acb97d0647cfbb3d4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4b34d3cb673447773f6da23e9cf52b98e99f718.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4c3425fe683d35dc3335db77d183ad1620b7a92.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4c6c405cefe204824e8fad1b3dd34bba87e796a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4de1bc135191f3c2aff740f4c6bb7e98da42f84.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4dec99707511cebd9188d216ee0a148d729b470.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c538dc4f65d02776875627cbd20a9c794d70b043.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c53e295b68e807774ed31bb914e4bc59312a77d7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c56aa150611b0d4800470c1493dc907082a5c23f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c581974c8b6f43f60d0af29c350d850b55c03121.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c59937be2b9a13d6520fdcc922e4e75c9fa085ab.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c59a22c6efd8bb8815887325aa0b739e260cc754.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c59ab718fa23f24f09a713ac28a339208a7a5802.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c5b440ca9a5196ee1e72c878c87d96934e9273c8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c5fcdea177734366d3bf283317a65cc3fffda611.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c5fef330a975002ed15670e8e7b26a10376d3cb7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c64f4cdce32189065362a502105c31bd2d9d99a4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c6e2da8b791d31f4ba05ef5f833fd6dea9e35f1c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c7568e11e44ce70924d27e683190422cfae5c31d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c7af2bbfac25de2853be344b9f636226c1c0112d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c806d7803d06ef8aac1d5caac9f36aafd47653d5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c80dce1a17d073259250ec0c87ade69e639ffa8e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c8dbfaffc8a9b573f194f9c63f1175d9725f8950.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c8f6461673882d636772ae4d26e78eabcb568f31.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c919b8ed877d4244d01a17ecb948b459e361ff24.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c921a4790f982d48bcaf950123c699647afb739b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9312d7159369d13f3148a6f0882dfad6921ceec.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9530e20038eb40c49bc8b045be0cf4e7e6b4eac.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c977735a36c325706bd19a12df66ed0839b032b1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9ad71883a19b522486706d3705700c012a6fc19.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9ba0a3369d4e4eaea1c902a90e6501f232dd57c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9f1e7e478a2208c4d32e2d7e6abebdc16bcc5fe.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9f28230817c9d9805c41dfcd4e834fe302e1df1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9fb8343e623e46f01893a2b61345d1ca5928671.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9fe51f982abd60e567d4238d3266fb60e45814b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca00cfdc5592b7440d72482a18781e9cf3afb05a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca1992a2634cd6674076611be54197c715ad8271.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca3975efd767ddf7c12e308d948bdcaf0968493a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca3d98ff43fbb80ceb82fc22ab039bee898969b0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca4c6ad28aff1976c6dd36974ec3b339aa3090e9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca5681d4e5871aacef74bdba9e368445875252d3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca920c3239bb5796b1ab2fc75177eb3b820aa784.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cabb7b12cdd9b8b522af577e13232b2459dbd38d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cae6c7efbfc831e2bcfc8c1efa1a486c02627cbf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_caede7a18f3e3d5e24f6c70392413a2cda16ac15.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb10303a0b79f2710eb7c66896d3c1f8b12c04dd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb1a0ce432c27f4cfa51731c3ef181bf60c8a727.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb1b91c16e0255fe7a0a85638b98d94634e143a9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb1deea4f4fab0db31d46a91228601f0c272d6e6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb20538073888bdb3174a8e9c32d7449072aa753.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb3d5273945c5d40cc05c2660af2df1fb7a15f3c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb4576e8ea5d59d7663f3760009a00a19e1b0667.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cbd571f4fe576fdb17d5f75a558cb6747087c7f2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cbe5a98163e878c7697e554758ebd0597c2c1760.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cbf3e4d4d4837a0cb33b78c4f2767b1d93da0850.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cc127a63d56099e08125b16939dac82f0173122b.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cc4ac5a18f57f2ebb65f7e356e858ab0d59b2133.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cc54b107e1b557ea36b5cbaf7fe3dfce05415c86.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ccac6c0e61b65c9422c7f30fbd979031698370a9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ccd0b777df1328bf24e070ed4cdf8615bb2199fe.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cd0453a5c3828c1358360f31f5d3b7258e17fdb9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cd4efcdd12184211c74e7b3f2f30fecf1041ca32.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cd757a8bbeabd16a44d149ab188430f6d79ddcaf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cde0582e1aef74f9209de638b553ec0671476258.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce4714e4f33340859c106a3129993e22652262e2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce5064e27ba427cb951f7e1b01328b0beb6b2b7c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce5ad502dd40353312d561e9f40aa478c16ef5b1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce5b5932f6df9a194ceb0d69220fba9596528eec.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce5c161b725becf059fb4439c668edd454ac77d1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce909cb5f96a4884caa0d2eb8c5e6bc7fa352797.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ceb9544e2a0caae2c9e3dd8bbd2c509e8dca1379.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cee81ab2e2678816c7b516d2d4c50e8cb5874c68.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cf5c6c0bfaf98f6e655fc443246b81fcc730fe97.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cf73e1fc0015094861ca0c1c81bacdbe0c5b8f37.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cfda56a4eb08b803332f25bda6209932d9624acc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cfec97bdfb6fa95e057eaf5a8138853e1c0884f2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d00f65bc99ca08eba66564d34f72f2769bff9491.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d036096f49a89730f8af7e75457c88cb8ae64165.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d049a1b8f4c1c6d37973ce38593efda1de8ce0cd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d04dc4ed02eb42c3fe303342801ed3073a0dcb8e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d06ba4c996570ddab77b6ff1e2a0101b638543eb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0863830fc5d43dc6d6400280e892bb7de2892d4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d090b771a4f9750132f549c82a88b4ab00dce5c7.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0b09e8513646fbb2a007544a63ec9e2b04dc4c2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0daa59f5dce6fc3965193ae37d8c82a3d1834e6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0dd0165ee91c095a19ceddf08789e3576912590.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0de618ff3ea9f67b90f2227fb7fcc74ea34183d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0f63cafbeb445408c884727b473667fb479675e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d137b7b6e04e1caf43a62bd6788a75361cfa98f6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d1840494c4fa78ff399c0399b3ad7ca3d22d4587.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d18727988e47264b42b4153dc82fc1a750f08db0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d1c0dfd19a08d61586758091370acbdc6f267017.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d1c25cfc437d8bd803860e39a45b2f3b9fa48393.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d1d3eacc320104100bce46235fe656e5a8223c66.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d20d45aa85c0daa299da98c277cee826fe67bd27.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d257148f457557ea80ca56690e525db3a4b0ff55.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d25ce4b3e9cc392ceafebc7fe3bcbe05aaad4bbc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d2d08c5470a385d0160b2c1441fd1c30fff1c17c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d2daccc4b3a0f90bff39cb4597f8b7e484613d9e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d2dfdb42c1b380e860aa5609302f29698dd27923.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d2f4b869ff23874b6bde0aab68c419108b7e69f4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d32c64ef01aa228277d031a74df51363f98aa2b0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d34d6cdcd81a456125ab5e0875466c6334d8e5c8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d34fcb56caa8f80404789fba0ffac447483a4d84.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3784fb4c0685d7b651f4113f3c71e050881f3a5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3a23ded424200d0c6f06b1dbd0a7b7b0e7b5d9b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3a2edf232786d458e2125f8dfeda8847f842afa.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3af8763f289dace1054bdcb4dfeda28b0aefcae.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3fce1e11aee2273620e75efe4aa0390fcde9ba5.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d40569ae9dbd693c0ab3d6ba69704d31e451011b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d41b6a64dd181f2efa65aaed03a3d229b3566c1d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d41cd6b60a97e7071518cbd1a63abb8b910df024.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d43715cce8935439f90172d141050d78c7e76fb7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4605b2ad3e3753c5f255678abc1690b949c5abc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4645b713821371161a9925dec8a3d6c157ba1aa.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4aff499ad527be5fe33b8e92547df57af26d40d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4b99af9a573df50a27fccbec3fa8e350f1854eb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4c9f975891087e6eed6393629b41155deafc509.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d50ac8e8a03f8e7ec2c6e993dd39f09f465dab57.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d54ac01458df3f240e0656d82330f9de23ba9651.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d54b3731883a5f8393d60d27487f8d017aedd3f9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d5e82799f4452e148c3e02acd6526cf30757eb52.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d5edfe3e3dc3008b928c8e6dbd50784b905f189e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d600779c17b7b21c18e1308e6d765fe02a7945d3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d6149eea92f2c40c11de3b778102fcf9b6a006b8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d623b36cc3f56d1001b2d3abadd8a5628fefd014.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d63c8c746055851217a514321cd735eaf6937263.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d64b8b52f4a98801e185e2f132b2f80c29dd0c37.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d66b79c4ebdcfd239cecec58203606bc123bd6bb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d66c30148a6fa816937f2f095802264d3dfa0273.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d703eea8075cacec4d41fee7dc4734f593ee79e8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d712f23ef88ae5d7b161d36f42d22a5ba53b6354.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d713fe25dc90b3511fc259cebf463376dcb55d84.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7145383e39dec0e346b5094401acf85ef3c2075.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d723b191785c97d284675f700a7baeb52a2eb791.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7290cc4c3036c9205e689cbcc60e7d16b97a7d6.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d733f4c03e338ea7c6d8f759c1132499bdcea059.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d773df9ccfc1ace90fe3afb5c00976deabedf6f8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7adde8780b39f1364c572a19c3bfb19417678e3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7bda8157fb27d544e049fd7d2ec735725f1bf44.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7fae2c18645d36a181a0bdd2d8ca7a4ac0f6d1d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d82773721479613ad72e334510a248f1436b38d6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d867098db97b3f26e71a151c63b74260bfab21f8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d86e4dcbe9c4cac8f7c8c5d97ce384ae0cbdbfbc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d8901a63986cc28ef24cab012b32114851a8c1ec.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d9061c204d8a85c974676f4438994a0be9d69a60.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d924ee32b178b6bffa7a71603d6e2818f66177a5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d937609afa8e21a761dad6b01ff3f26346e450fc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d95835bc6f000d3a3379bbc38d90e83dcaf867ee.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d992eab7de49033f5480c5e86a69e675db0d2a19.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d9c23b7f8fcc4e4f4c81f5f00cfd345b98df2e0f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d9c3e27b522320dcca5ee84fa534b03aae2bfea9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da07d8b5666423da30a95e3b2cabd3839d200981.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da29a515d14dac02066bcd4701285b9916b43cf5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da6afccdee4107507a64323e17bf12c46da2b92a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da74887afedbd67928fe4d596709f9ff92530611.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da822ea727fb3543e445e4000f7e6ebb946d6a3b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da9f6e1d59132fe96709490af25bd794f267851c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_db0d0cf55d90b3f3c9eecada1db93c420f34b1ae.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_db5016bff9e5dc37184d2b9417eb351c7ea1c322.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_db85839ee8d464c5a81b8dad9839f5e0f4b467a8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_db8f0bd93b352d28c5b6d78f4332026993f0bea4.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbae1670fac6812b2d2cbad973e4b475509ea504.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbb06b43d5d65429e23cc717448cf1fffb0cfd74.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbc4135fce01e8731fec7a78d0cc0fdeeae28b90.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbcea8f7b5930abf76eecefce92d0db785d2df5d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbde2ef18e2174ebe13a6e7c8c2a6b05a6612047.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc039d422a57c159ea4dbcc867d766ff1b356a07.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc08afbff5def8bcb4e823657ce01f57c9dc77c9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc184767d723f4995791848cdc68bd948408204f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc1a7f9b1afeba6690fdc0d0d1755ea89c805573.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc34b6ef496d4e0d8fbbe10731d4a7b1c136c036.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc3d625c5ad3e871f5a727ac946df642d988b9ab.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc4d27535b9570b8f4b790470a83c1d0a9a2b6ce.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc5ba6d73f331c76e696953606c5b347b6a46f3f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc62a8db637d32e7dfdb2521cbdae6e1fbbd5fd1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc818f3ce244743cb1dbff9aca399df90742a6d0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc91797c1474a368e9cb056b50b4629d7736c3cb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc9e54273c0ea2358fb573a7d918aa7b09fe07f9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dcf815ef540060cc7ed43e1c57a28e1d080c5621.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd10bbf37503bbc92af82bc3487989b41b20ca85.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd11806cd2d3ef1127f676b2d98bf8fff2a1e5ab.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd35634440edb25cb095800b882c70aaceca1dbb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd67d442001d2b167e70e8730abde4d4461b8569.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd9494d9ac35eba6794a4f9120d2db9932596ef8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dda8d021381083bc48b7fb1840729254dd8e5137.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ddcb1cfea1b0dbe50a02252cba99428fd977527e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dde93ffe7fca311e136e42fbcd12b05c9fc7174c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ddf5339054f47d9ed6cc7f9e66ab21ce3bccf3db.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de1ff66d2aeb47d2fdccaa4bb6b9d066b380c99e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de26a187c4db06115072a5132e1166b5b03368b0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de36bc309877917a18fd21acb30563c7e2f233c1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de5359f0fba3da9dfed06ddbea8fe2a33a9cf40c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de6683d175affaa5ff261ab8503f64172d8eba8b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de7eb562a7eff31d589e12945d80233aac202ae2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de85901d66dc04b1143bb6404445baf65693b781.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_deb9ec2cccab94920e40f62a1f0f094acd919d07.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df0b2bcba57e77d975ec5304fc50cbd09cddf4bb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df4bb75ca79f805a81fbad750ad22f6d22b0d8ff.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df4c9eb48da49a61957537270d94e56cb4e426be.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df5b1c6758d4b8540158299dd0362297083084c2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df645b3888dc8d1df50c47c0d75822eebd3eb019.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df66feebc9a0dcc508ce002c255154622875e524.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dfcd68acfca68d1acac94f493e25be0ef20f209f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e02a198f23c409b715761b702d7b0e6e5992701f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e035773419a9b3631698a3d375d829af55f7731e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e088f0f7363804cf5403adef70828ab32d09a02a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e0966fa1ff013e477b1706928de6cb7f8587c154.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e09d9baa269dfbb30b714389d1733be51cc419b7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e0e48d7edfe9513f24ad9fae68cac3aa940b17dd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e10f47a44400de385ddbeb99475b717c5646fb41.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e11a3b7d4fdfed64e64f7a95dbc64eff541092d6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e13b86fe4e153e0bfa8d1e75f3641fe32b0c5149.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e16075c3a5fcfe63ba12e854bb1fed6873f014ab.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e16edb824cecf459a8ec51b8dc74b1e06369aceb.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e1c1a31a1d8556cbe0b6ea76faacc78855108539.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e1cc934ba7baab1a2eb062df1e4ee5066e9ffbc3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e1d85ad2c9d197f501267fe0804e6985802fbd18.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2762543d3380185e304f84749a70db1b8d3dd8c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e28fd64c2f2b27577109a984e6ab82f5f0fcb296.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2b629c37cf94134693ce455b8c88b72a39df7fe.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2bf6805a489739abb77c13173d57723e9304afa.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2c9f955f227430c6224ebc347649386be7f01eb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2deafd2f36cee29109fb824e0135407453adcfe.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e3015c5d50481547aa5754d042d9d7040cf1c7ff.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e307a1b0d5a8f94e0a0f4032f401d20b4b643523.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e334e691714f0b99773c2ac515ed82de0f387065.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e34b7e452a4db74189334697e3a240ad68085f0e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e389d0e4442cd8304081892ddc75043e68a6398c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e465193d97d43237c22c04478ca5833011d8dc8b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e477abef05ff37ec27705eda51896e2aa3a04966.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e4d9a2396ceccdadab24602f30e9070901a76dc7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e502730dea6987e2c038446c448aa08bdcc23113.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e514c6b4bc75d95a150104a17972abae77cb47ed.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e52e3053f30f780f346fa6b7a836ad2554cb85df.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e56757fb17f5e94a6ba1fb14540a68c36d571159.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e578ec9e09d3b78dca6b5bf0be1538657f02f319.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5935fbda313d3518f142f43d46f56c600f69286.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5b2bb9f8466de1ad5210e4c39ee7b8ecacdffa9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5b65fc519ea7cfcd19f7eddbc3acad6842ff558.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5c5079636a4a31a849ce8a5af89d50330a74628.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5ccd5f7ddc894b2717112cbfc766804e02b7bd1.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e618fb4e529104fc90069c8779ce5463460bd516.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e638053e01268a4c5883620fc6a9901951e2e01a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e639a1e84faa98477b05df71d363b9ff0f9b2760.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e68a9e05debd456a9975953f7b0d510e7a0f6978.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e6973d75297bd2c3432a7c88e8a9ee1c9ae693bf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e6b53fb8d81148ff384d31a703bb4c2e7a5a33af.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e6e0ec1db1ea308e226f675e68e29b839e41b252.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e6e6b10e73733716e71ebf5a53703fb935fc5e02.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7153f9a9b0b7c54ddf2debbe297efcffbb4fcfa.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e73a776ae4ba68c23acab1a5a6381684051738ab.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e75c757c67aa23cb88e1aced6fcf36b7b28391db.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e75d492ac3a6ab75648056bcf26250a4aa929cfd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e76879f8ff4796f48ad87ff8003f4f6e6adca9a0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7ae1294b6dea5c8b93c2b814fa7460c4047105b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7b2eb64b66d46359fab44333c2c484f4c9dd5de.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7c0a99e949baa5f3a7ee2d6e84427982f82f76d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7d37e7ee96c392fa24c02a9143438a3a7d05741.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7de729aa50c10d8101ef504138c3769e3286753.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e83c604d1b8260958becd1c7c209745ff9151715.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e89bcea4393593313d18a4aa6dcb44cd75bc828d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e8a9427f34bbf5ddb28a39161acc36806e68f2d0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e8d8fe5f4f8641998b8b805a20b2ca92d019ee59.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e8d9b65558398c0c10127b560807578ef117d7ed.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e907e8d1089557dfcc95a05160be5092e9119a53.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e95e3908479965856843317c8b0c42a6961dfd23.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e986d5f8d5591f3e0f1cdfad19c38c420fd93023.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e9b04e6d5527ba0b8089ba8bdd264e2d5759338b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e9b53fa68641f45baabf40b7cfb8b35a9a1b9c7f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ea077e68dbc1bed2dd20a5f4dd35e0cad6330ee4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ea591185b1c5f521023e250a26f742984255b241.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ea62567e9ea16771d8445464c38f5a2931cb355a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ea6a6d4cc262ea838dbb83ee747112f95fa297bc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eab6cdc59bf216f7045f0cf5f221bb91ec415cd2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eac353f963c52624cf79e82cc2b2c02eed94b677.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eac5952f46f4f2bf06257b00661774eeed48a323.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eb278488b2cca114adca5e4614d86f92447f937a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ebb241b947a0adfc8e50c5d71765c14af24593ae.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ebb9abf5b09e63cbe76390bb46ff7cbefb3141f0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec171210efd217c07d357fcf42e5372ad7e9abab.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec3deb1382003ac010d9bc1c59d1878d3ec7a727.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec51d24ab5f24e003ed6751ae8ae5b327892b15a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec7ec8d547ee9713aa3b5b667f22cdcaa8f62b2d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec7fc24902b1ebd8f2bf8088b0ecf6de8be8362d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec9f63a538940e5ace02ae5b5ddc01f730adac4d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eca613eaa8471ad7da66d2f8f2b8e07f6e02b467.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ecd7dec90b3c62bf3a30bd75d3c6869529a06b01.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ece60111633db08f765b3c7cd5cd768cbd030255.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ed37ba962e0288e2840eb0925d016b5a7e3b3164.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ed6bdf67720e938d538a867548ac3579b8238169.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ede81dbc4cb208ef6e684c76ba1eb451d37fe10c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ee1a43f2210a8d1e5623411c95c33424cee5e747.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ee239db5a67c23a383590a651f0d8a0be43a13c7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ee8e709eec7aef1fa681053c6d2969a5ff18c45c.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ee974931e65d6b16b7c868d462b95dcae20b7513.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eeb0e96b759e18cf703cfab0cda1385726f6e0a1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eee408cf9456ff977aa7d12345e9b2f1e60639f1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ef2ebb4a86e7ed0001de9c5e607b66fe8877409f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ef40f0acf1885096efb840ec5600ec421c4db331.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ef5421703cbfa63a58ec02701e245d479a1fbfc1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ef7cc2aa1ffd38298b52764a93cd1271b4d92f8d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_efaa0cb33c71cb8ca7b83dd0e7a6c7b01f6b50a9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_efb9e7d9af47cdf79f15f674f8976c05f08b0ce8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_efc6a7b25710f0626c3af534111b161e1459d2e1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f01468c62c878295443981662e037ec5213cf7a3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f020134822739be6fa0bb3d98e9dec79f025324a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f0209426a8e6bfeef7d8ae7b16db791888142298.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f028af9e5e3c25800dde938e991aaab4fc1d64aa.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f053c9c32518b895daaa3521827f37af78836fb8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f069b38b26c30bc770f74c856e47eb498f5818e7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f0cad48d9bc80d58705ea60eb2dda4baad68cedb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f1246d1013d954a9316f4432c986d3be9459c548.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f12f1f1b679cabab04218037ef370d2c7e1fe332.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f15c41ddb04ec7f80235bb3db19198dd6b699713.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f18c74becc24a93427d9c0838784e9b6caad6e81.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f1ecc90ad7b86791a9e6f73a582aeff30f393804.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f21596e8c608a795ff971aea8e199db9e72b65d7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f24bd5b92ce6bba640b8ec6b4e53fe35902c5572.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f24d42e820adc1a26a428d59df7ffdd7f8580176.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f24f26e45d5cf567d29fbe375fbf8abdec39186f.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f25b87c435bc5d7d85d738f3fdf68947d79f5a77.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f280e1639680ac1e5830a21f921bfe2cf364ef42.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f2da112b1e07c44fc8a7f19368da203f6935049c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f30316cfe49323638f71ba688dd8ff9b2266b335.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3193ea266f3718398bc5622f8bc7042c3527a42.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f34fdb8294257d951dcc9c4fa7ecf1192568b91b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f36aaa63ed42a578b953ebd614318d44cf44e8a3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f395bec57c3b2e6e169134dd8d20b287d7405134.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3bf7ef503bb026258b3ec3d82d3ef1443046964.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3d0166931e4406873d8f552a5d5b61fde2391a3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3fd08d56f8a9be1a8dd104cdb1ac58e283b5064.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3ff73f82aee3184849d04c2364eaa45c6d0de9c.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f42cf0e5fe479690883507028748b0cd3dc83cbb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4658c32d562f9d60c5ca1262a2e0df2375063bb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f48f8b681a405bfeba5aadaef40f32367ec5cd2b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4900c0a5c0d03dc17d7a907ab40652d9920e756.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4a6438394dd3427f29aa0bbe58ad1f797c3c38d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4b87f983a5e84582efa1663f84da76cf60b5f6f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4c803838f5644ccc6f04f7c8a6233fed0b6639e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4df1cbfbaf67705820f125b474469ad7ebab0c0.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f50fa4ea674a590d0a817367ad9915a5fce20c51.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f51f1a11f778d99a00aa5959a3e58a41fcbfb1e3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f525b59df454ccf53da6cb201e0aa8d09f52a2ad.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f57f84892e2a8496169b7406e63b0d4f5aa63aaf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f5803aadd93e33567aa6b23100ce4fbb6c040dd6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f5f1797f6b672a55476348571ce17645c8a62869.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f6566441ac3074578cfe45758ba0583c0da0a5ab.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f672bf80a78885428b2c02e522426470653a7351.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f682399cd6412fed6a1141296a7e4d42078f7b29.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f6856ca950bcf173571766c3f04de4163be0402e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f69548d6cced86c21c09c6475237a0cb926df0ed.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f69878f4ca8cfe6b8d8748766f66a1ef8eab20ad.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f6f102a388ffb05c690a20a29cfe0b35a35eed61.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f7035f4bfd8f2f427720a07e3c311bccc1dba683.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f71f96ce4dcc7f789a8ace73c230c203b05ff6dc.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f727911254904ce4341e4ff5f8bafc430b8cfbbf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f731289837f915e2aec1bd01eef1b3c1b099864d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f79def2b4edf6d18f6ef1d6b141f9e0435441f6a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f7aa9c39b06e55bf4bc9f9a2a0fb075c9d4e69ce.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f7cf08242b3fb1c643d4149bec985b667b9d28fa.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f851da732f397624717160f89271514bc334b59b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f861d8693f82d22e2c5b1abbcbae5f30f4433e5e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f87790f260630f312b84888dcbdf849ce130ae59.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f87991cb7787a29d3ce4711b4ce04c5fb6a14ca9.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f90410c26d7649e21e2ae5e32e7af89d84d2ea70.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f92e9a82c879051d6fe3c42108f8a574187704af.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f93bc23b8a4f1e0fc5c5756c4e1c835bf59dea09.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f93bf815b520a9d9e17b43bf9d7fb870751b6225.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f974b12e83e214c30995a25631d37df1478927af.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f9824fb32933b27501ae8a7f43f460a2dda6a814.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f98a6b193fec3203eaa75819f6b51aa45a48f212.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f9c58761c927b222112cb5cb6c9acb5d3c915785.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fa16fa84278b489af253b52839786f94aeeac36f.hip delete mode 100644 
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fa62a97675719c2e8e9bb97361b92ff1c7b9d2ef.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fa85f869a92f0482605e52019828244b12e12b44.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fabdc143c29d5ca50ab1e96a814bda6d05b0d5d2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fac5a0f98b94530befd634891e42c424bb86f0e1.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fac99c3c82b77946f6844699d2333cd532a78a26.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_faf56e45b2240515e97fc1bfd552eb03b6de5094.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_faf686067fa433cea5e95dd523846dc881eff635.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb2fbb135d59028afcf867c2cf08edc323565528.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb4c15452f9155c5966990f09432e5eb7e28e785.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb4c5f8fecfbbe16e6648becb3b5ca89fa3d8a94.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb5bb49928ce5515d7b297d5eadd4ec70a22d60b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb79e1f9231692d736dbada062ed6821f34927bf.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb9477a613665cebcad781389ba7c5a36f51efe2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fba36678d5047ded97ee7a7ba9feb9569afdb6ea.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fba47fa8d9b5375bc408af68b67345ab9dba2eb8.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fbea85b766bf0c918ee0baf24dffc6a5563d5105.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fbeec221cd63adaedceec39db41ea942f99f5133.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc030b61ae20c4b7d9b2d10930a17e01e9e93328.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc1790325b59bd44b0a5f6cf9723a25fd845cba7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc1eb85a00017efdc610e4259d2abe935b85304f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc5841a729099340d608e31023acbeaeade3e886.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc5ebf0f2200f37ccc0849e0c3745f6e2f00111d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc7b0916744b593435d8e1e7b6d874d760cd5e3b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc86c13e933cba40553ffba31d53aad27415ce4b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fcb0b08e29b2e1bf181fceceb9dc416e54f52b00.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fcb6ef39c3db49f26f736d6c9221dd825409ec4e.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fcbe827108d252b2f5847fa8e132c9c3e56a90a0.hip delete 
mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fccabea88b8e290688c1b360875d228e6fdf1624.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd10a3b937e9659716925e39a01d794914b08e26.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd19d7614f2ed5da21a52ed172ef62cc07c9c01a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd26e43ca652e6f58ff48c356165aa4349833b55.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd345632e0cae0d549ba79626a08b1885711deb6.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd3558b4c7a667dbc365c4c2ceda646975408f51.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd614df484b263deae3b3c20adb0ce7b62eaa651.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd9cd1305633b62b68fb8474ce021f639f8492e7.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fde12cd366d6850ce26afce98e5076b695b4875b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe245e9ea974adce2b9807d33b9ba12d916eaffb.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe72cdd69944d2d765478d4aed13066a02b76f6d.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe8b8c3525fe86a20a2d6c69585f3e36c16caabd.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe97b7adcd67ed9bda8831d1f3f1ca7590c6d251.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe9d98dbec5096a89b116f85675af772f023014a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_feb5e77111fe1e20bafdb83a925b5faeeb6214af.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fecd7501265b4c4dcf015485e63e2324304f70d3.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fecffa403b3631b1957e1a9a06f18fdb3b4eee5f.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ff453e3bdc9752cb7b81f7cc3056325a8b9a8ad4.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ff6862dbdbb20bc63a650e1f93e9ac169bb702b2.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ffb5b7349a671b182d73c8016590f26fe06a4cba.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ffb8adef0cef91a86f36872407fea35df90e8f2b.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ffc6056d9fe125a4dbe08c1d86354e51f7daadd5.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ffd868d49abdb769ab82c21508d655daf54b8a99.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fff7aa57cca501f221077124359a589b3a6f9d0a.hip delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fffbfcac254e33926131a71905e93f9cc0aef89e.hip diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_00042c36bc588e60a7c8a9ba297a8a25d8ac0660.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_00042c36bc588e60a7c8a9ba297a8a25d8ac0660.hip deleted file mode 
100644 index 82ce69ab23eb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_00042c36bc588e60a7c8a9ba297a8a25d8ac0660.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - 
constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0029076f83a3dc695a167beda6fe19230a2b114b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0029076f83a3dc695a167beda6fe19230a2b114b.hip deleted file mode 100644 index b9b428a079f8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0029076f83a3dc695a167beda6fe19230a2b114b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - 
using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_006c417a52a1bd7c55e45d111483d26f4480caeb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_006c417a52a1bd7c55e45d111483d26f4480caeb.hip deleted file mode 100644 index 3ed1294bb7b4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_006c417a52a1bd7c55e45d111483d26f4480caeb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = 
ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_008f2429c678d13386a06e8d8b15c4b480940ff3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_008f2429c678d13386a06e8d8b15c4b480940ff3.hip deleted file mode 100644 index f7c9761d7210..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_008f2429c678d13386a06e8d8b15c4b480940ff3.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::bf16_t, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_00a2adbe938d458d51ca5fc4020667a215b672a4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_00a2adbe938d458d51ca5fc4020667a215b672a4.hip deleted file mode 100644 index fd3344b077ab..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_00a2adbe938d458d51ca5fc4020667a215b672a4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_012c0f480917c329f4c3c6c666cf32af2d82b294.hip 
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_012c0f480917c329f4c3c6c666cf32af2d82b294.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_012c0f480917c329f4c3c6c666cf32af2d82b294.hip
deleted file mode 100644
index e74476990d44..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_012c0f480917c329f4c3c6c666cf32af2d82b294.hip
+++ /dev/null
@@ -1,80 +0,0 @@
[80 deleted lines: autogenerated fmha_fwd kernel instantiation, ck_tile::bf16_t, hdim 64]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_014c209d5cfc6b965bfd78c64bf132c0154e32be.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_014c209d5cfc6b965bfd78c64bf132c0154e32be.hip
deleted file mode 100644
index 8ca5e5d1cded..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_014c209d5cfc6b965bfd78c64bf132c0154e32be.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[138 deleted lines: autogenerated fmha_bwd_dq_dk_dv kernel instantiation, ck_tile::bf16_t, hdim 64]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0153ec18d3ded0f8bdc6459ea5757ebd94d9faf2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0153ec18d3ded0f8bdc6459ea5757ebd94d9faf2.hip
deleted file mode 100644
index a95b27e98885..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0153ec18d3ded0f8bdc6459ea5757ebd94d9faf2.hip
+++ /dev/null
@@ -1,73 +0,0 @@
[73 deleted lines: autogenerated fmha_bwd_convert_dq kernel instantiation, ck_tile::bf16_t, hdim 128]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01ac1a2ecf9a487809e46faa92e267df2d47de91.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01ac1a2ecf9a487809e46faa92e267df2d47de91.hip
deleted file mode 100644
index 54a4aa16302e..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01ac1a2ecf9a487809e46faa92e267df2d47de91.hip
+++ /dev/null
@@ -1,73 +0,0 @@
[73 deleted lines: autogenerated fmha_bwd_convert_dq kernel instantiation, ck_tile::fp16_t, hdim 256]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01ca79005067e20e4eed5a72ff9187cde702cd1c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01ca79005067e20e4eed5a72ff9187cde702cd1c.hip
deleted file mode 100644
index 5244d6b6336a..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01ca79005067e20e4eed5a72ff9187cde702cd1c.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[138 deleted lines: autogenerated fmha_bwd_dq_dk_dv kernel instantiation, ck_tile::fp16_t, hdim 64]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01cb354dddef6e99e4ac843f2adafcddfc58d520.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01cb354dddef6e99e4ac843f2adafcddfc58d520.hip
deleted file mode 100644
index e9cf79f84b3f..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01cb354dddef6e99e4ac843f2adafcddfc58d520.hip
+++ /dev/null
@@ -1,73 +0,0 @@
[73 deleted lines: autogenerated fmha_bwd_convert_dq kernel instantiation, ck_tile::bf16_t, hdim 128]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01d12033d59ce2799a2a024e5d9232325ccf1320.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01d12033d59ce2799a2a024e5d9232325ccf1320.hip
deleted file mode 100644
index 5becf11a95a8..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01d12033d59ce2799a2a024e5d9232325ccf1320.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[138 deleted lines: autogenerated fmha_bwd_dq_dk_dv kernel instantiation, ck_tile::fp16_t, hdim 128]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01d3b034a2d8d0b83c0aefa4faac6c3f28ce737f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01d3b034a2d8d0b83c0aefa4faac6c3f28ce737f.hip
deleted file mode 100644
index 49470c9aed7d..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01d3b034a2d8d0b83c0aefa4faac6c3f28ce737f.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[138 deleted lines: autogenerated fmha_bwd_dq_dk_dv kernel instantiation, ck_tile::fp16_t, hdim 256]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01e2428c5447aa9a78f79f73f31cf685c586872d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01e2428c5447aa9a78f79f73f31cf685c586872d.hip
deleted file mode 100644
index fff9e8ea4f6a..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01e2428c5447aa9a78f79f73f31cf685c586872d.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[138 deleted lines: autogenerated fmha_bwd_dq_dk_dv kernel instantiation, ck_tile::fp16_t, hdim 256]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01e8aedb7b7d77f44a46b2e9b7a826f245aaf4a7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01e8aedb7b7d77f44a46b2e9b7a826f245aaf4a7.hip
deleted file mode 100644
index fdee95116f79..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01e8aedb7b7d77f44a46b2e9b7a826f245aaf4a7.hip
+++ /dev/null
@@ -1,80 +0,0 @@
[80 deleted lines: autogenerated fmha_fwd kernel instantiation, ck_tile::bf16_t, hdim 64]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01e8f0df0c54ce619e5b66441b3c96a5e18b05d6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01e8f0df0c54ce619e5b66441b3c96a5e18b05d6.hip
deleted file mode 100644
index b2c359694a71..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01e8f0df0c54ce619e5b66441b3c96a5e18b05d6.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[138 deleted lines: autogenerated fmha_bwd_dq_dk_dv kernel instantiation, ck_tile::fp16_t, hdim 256]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01ee0083f6df962c4a754cd3295b1a436c590a0e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01ee0083f6df962c4a754cd3295b1a436c590a0e.hip
deleted file mode 100644
index 4dda2f0d5748..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01ee0083f6df962c4a754cd3295b1a436c590a0e.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[138 deleted lines: autogenerated fmha_bwd_dq_dk_dv kernel instantiation, ck_tile::fp16_t, hdim 128]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01f74764c3c3284fdd1b67d0ea781c2261ed0de6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01f74764c3c3284fdd1b67d0ea781c2261ed0de6.hip
deleted file mode 100644
index c09ec0391119..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_01f74764c3c3284fdd1b67d0ea781c2261ed0de6.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[138 deleted lines: autogenerated fmha_bwd_dq_dk_dv kernel instantiation, ck_tile::fp16_t, hdim 128]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0225857454eaab2eb664aef7a0849ce12c32fdf9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0225857454eaab2eb664aef7a0849ce12c32fdf9.hip
deleted file mode 100644
index 08b088d8cb3e..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0225857454eaab2eb664aef7a0849ce12c32fdf9.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[138 deleted lines: autogenerated fmha_bwd_dq_dk_dv kernel instantiation, ck_tile::fp16_t, hdim 128]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0237c76137df14fb808ade8bd6837045f2aaa5c9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0237c76137df14fb808ade8bd6837045f2aaa5c9.hip
deleted file mode 100644
index 0cb05acfe8bf..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0237c76137df14fb808ade8bd6837045f2aaa5c9.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[138 deleted lines: autogenerated fmha_bwd_dq_dk_dv kernel instantiation, ck_tile::fp16_t, hdim 64]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0271bd8b7c270e1593871b638288a4923342c446.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0271bd8b7c270e1593871b638288a4923342c446.hip
deleted file mode 100644
index 325e4cbe881b..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0271bd8b7c270e1593871b638288a4923342c446.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[138 deleted lines: autogenerated fmha_bwd_dq_dk_dv kernel instantiation, ck_tile::bf16_t, hdim 256]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_02d88a03cd3966dd0cff550065f58c3ffecfff6c.hip
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_02d88a03cd3966dd0cff550065f58c3ffecfff6c.hip deleted file mode 100644 index 0e578836ab41..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_02d88a03cd3966dd0cff550065f58c3ffecfff6c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_02ff94e3c787a7b06ffc90c25777fa74f225e32c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_02ff94e3c787a7b06ffc90c25777fa74f225e32c.hip deleted file mode 100644 index e7cd9d906584..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_02ff94e3c787a7b06ffc90c25777fa74f225e32c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - 
false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_030a759dcc92028b4c6f317fc230b98cb929e806.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_030a759dcc92028b4c6f317fc230b98cb929e806.hip deleted file mode 100644 index ae0bf3c79c8a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_030a759dcc92028b4c6f317fc230b98cb929e806.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_031b12f9fd94e01aaff2c0da4f35f346822087e4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_031b12f9fd94e01aaff2c0da4f35f346822087e4.hip deleted file mode 100644 index 996baf438ef4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_031b12f9fd94e01aaff2c0da4f35f346822087e4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_036887daf6cc092e7422a17882488e59cecfb643.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_036887daf6cc092e7422a17882488e59cecfb643.hip deleted file mode 100644 index 6287aac677e2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_036887daf6cc092e7422a17882488e59cecfb643.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_037c6c80fcec3eb8b0bef50ad6af6d27bf5447f5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_037c6c80fcec3eb8b0bef50ad6af6d27bf5447f5.hip deleted file mode 100644 index 91c2ab28c29f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_037c6c80fcec3eb8b0bef50ad6af6d27bf5447f5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0392491c5a6dfc742c2be483419a40f6a7a7ea56.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0392491c5a6dfc742c2be483419a40f6a7a7ea56.hip deleted file mode 100644 index 405216343311..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0392491c5a6dfc742c2be483419a40f6a7a7ea56.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_03a71615a088e972c998f9c7cb44566c268c5124.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_03a71615a088e972c998f9c7cb44566c268c5124.hip deleted file mode 100644 index ef626eecd7da..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_03a71615a088e972c998f9c7cb44566c268c5124.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_03ff035717140f7385282419598cb4fb2881ce8e.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_03ff035717140f7385282419598cb4fb2881ce8e.hip deleted file mode 100644 index 805b8dc0a38d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_03ff035717140f7385282419598cb4fb2881ce8e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_041a0718891596ddac1fb0088637029233ccbe60.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_041a0718891596ddac1fb0088637029233ccbe60.hip deleted file mode 100644 index 3e4a7c904e36..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_041a0718891596ddac1fb0088637029233ccbe60.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_042a156e9eb935555ab14a84461959b466c2fb5b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_042a156e9eb935555ab14a84461959b466c2fb5b.hip deleted file mode 100644 index f6191ab20bc2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_042a156e9eb935555ab14a84461959b466c2fb5b.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " 
<< k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04641230fe9a50a221047f7a1df8a370f72805b9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04641230fe9a50a221047f7a1df8a370f72805b9.hip deleted file mode 100644 index 59ab7b1f2584..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04641230fe9a50a221047f7a1df8a370f72805b9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - 
if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04c363e11d202c6d2f4bb753661c5a2043edc0ad.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04c363e11d202c6d2f4bb753661c5a2043edc0ad.hip deleted file mode 100644 index 7269255197c1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04c363e11d202c6d2f4bb753661c5a2043edc0ad.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04caeecbc01667ec6f5599358a0a20423aa9a00b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04caeecbc01667ec6f5599358a0a20423aa9a00b.hip deleted file mode 100644 index b3613e705745..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04caeecbc01667ec6f5599358a0a20423aa9a00b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04f39b453505f68a5091f68b1c3de48369d1e7ea.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04f39b453505f68a5091f68b1c3de48369d1e7ea.hip deleted file mode 100644 index 3518c4f0ee3b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04f39b453505f68a5091f68b1c3de48369d1e7ea.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04ffca078cfab8bc6c4ccd1cc8994a1bb4a88ea7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04ffca078cfab8bc6c4ccd1cc8994a1bb4a88ea7.hip deleted file mode 100644 index 8d967cb643a6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_04ffca078cfab8bc6c4ccd1cc8994a1bb4a88ea7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0502e718337eab7d47aa65cea7d3c5f641484520.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0502e718337eab7d47aa65cea7d3c5f641484520.hip deleted file mode 100644 index 8054f38064ee..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0502e718337eab7d47aa65cea7d3c5f641484520.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0513b2f3bd8ad51315aadb7f63737201898adca8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0513b2f3bd8ad51315aadb7f63737201898adca8.hip deleted file mode 100644 index 7e5bc574a2fd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0513b2f3bd8ad51315aadb7f63737201898adca8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_053981d9e7af2ebc0f91e61ac5e25cbe68c95bd8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_053981d9e7af2ebc0f91e61ac5e25cbe68c95bd8.hip deleted file mode 100644 index f475e4b4883e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_053981d9e7af2ebc0f91e61ac5e25cbe68c95bd8.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_054fda16133a0d25077967b05425f9128e1fe1a5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_054fda16133a0d25077967b05425f9128e1fe1a5.hip deleted file mode 100644 index 777a1c0f9e95..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_054fda16133a0d25077967b05425f9128e1fe1a5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_05538339c21c92c53d237865d72debaaf2ee5075.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_05538339c21c92c53d237865d72debaaf2ee5075.hip deleted file mode 100644 index 6f7e82be7e1e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_05538339c21c92c53d237865d72debaaf2ee5075.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0595316f0dfffda03e5296b959a49ec3f3c48d67.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0595316f0dfffda03e5296b959a49ec3f3c48d67.hip deleted file mode 100644 index f46f2b4f372b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0595316f0dfffda03e5296b959a49ec3f3c48d67.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - 
false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_05dfe927fd64a564c5fad537fb7c41ee9c94c2c0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_05dfe927fd64a564c5fad537fb7c41ee9c94c2c0.hip deleted file mode 100644 index 486f54cbef9c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_05dfe927fd64a564c5fad537fb7c41ee9c94c2c0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_05e60b3ab7477f9edc8576a8bf43e3a62b8d5ef8.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_05e60b3ab7477f9edc8576a8bf43e3a62b8d5ef8.hip deleted file mode 100644 index 90659871cb42..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_05e60b3ab7477f9edc8576a8bf43e3a62b8d5ef8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_05f794c7023cbb7e35f1fd1ae45bd2377bfbc520.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_05f794c7023cbb7e35f1fd1ae45bd2377bfbc520.hip deleted file mode 100644 index da8fdbd723df..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_05f794c7023cbb7e35f1fd1ae45bd2377bfbc520.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; 
- -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0628931bf5cc1daa6e106cf60bb21fa1aac6b1df.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0628931bf5cc1daa6e106cf60bb21fa1aac6b1df.hip deleted file mode 100644 index 85d85fe6f34e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0628931bf5cc1daa6e106cf60bb21fa1aac6b1df.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_062c8c3c1cf6c33af4574099e9b6ac54a55ad776.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_062c8c3c1cf6c33af4574099e9b6ac54a55ad776.hip deleted file mode 100644 index c1bac538b616..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_062c8c3c1cf6c33af4574099e9b6ac54a55ad776.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0682150e93f547e00f13cd8984779bf49b91e50c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0682150e93f547e00f13cd8984779bf49b91e50c.hip deleted file mode 100644 index 3ac2e48e176d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0682150e93f547e00f13cd8984779bf49b91e50c.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_069c663be0267c009be4814e9e4e7c13ec999411.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_069c663be0267c009be4814e9e4e7c13ec999411.hip deleted file mode 100644 index 51937df035c8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_069c663be0267c009be4814e9e4e7c13ec999411.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_06ae52ef937cc27c544e32025ea0dadb7fad982d.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_06ae52ef937cc27c544e32025ea0dadb7fad982d.hip deleted file mode 100644 index 04a4bd9026f3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_06ae52ef937cc27c544e32025ea0dadb7fad982d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_06b74acd9abfbd1c4ec2f4c718eeb92a0bca7bab.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_06b74acd9abfbd1c4ec2f4c718eeb92a0bca7bab.hip deleted file mode 100644 index ce8d49011352..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_06b74acd9abfbd1c4ec2f4c718eeb92a0bca7bab.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - 
false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_06ba94794a14f0f0022af6f5f3c16e1e16959d4c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_06ba94794a14f0f0022af6f5f3c16e1e16959d4c.hip deleted file mode 100644 index cd324663db2e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_06ba94794a14f0f0022af6f5f3c16e1e16959d4c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_071751b1012b90f7b57f8591cd06ae1fd27d9cd3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_071751b1012b90f7b57f8591cd06ae1fd27d9cd3.hip deleted file mode 100644 index 6f184966dbf7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_071751b1012b90f7b57f8591cd06ae1fd27d9cd3.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0766e7aa4b263a811408b285213e47176ee2bdaf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0766e7aa4b263a811408b285213e47176ee2bdaf.hip deleted file mode 100644 index 7b86f6cf2955..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0766e7aa4b263a811408b285213e47176ee2bdaf.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_076b3beb57b30afb30636f948e3989b346b38d20.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_076b3beb57b30afb30636f948e3989b346b38d20.hip deleted file mode 100644 index 7638d922dd73..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_076b3beb57b30afb30636f948e3989b346b38d20.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0789852b0cd3cc030c78b28f2fd5b6b0546382a4.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0789852b0cd3cc030c78b28f2fd5b6b0546382a4.hip deleted file mode 100644 index 49714c513271..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0789852b0cd3cc030c78b28f2fd5b6b0546382a4.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_078b96ad691a85eebd18586db0b62b8911016d9c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_078b96ad691a85eebd18586db0b62b8911016d9c.hip deleted file mode 100644 index b2668f3d0d3b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_078b96ad691a85eebd18586db0b62b8911016d9c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_07c3fc96d2bebe546dce6ebf46e5c7a519959599.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_07c3fc96d2bebe546dce6ebf46e5c7a519959599.hip deleted file mode 100644 index bb4930d972db..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_07c3fc96d2bebe546dce6ebf46e5c7a519959599.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_07ff04fcc273e469737512893ea3fb5876ac131d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_07ff04fcc273e469737512893ea3fb5876ac131d.hip deleted file mode 100644 index 68d9e794357a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_07ff04fcc273e469737512893ea3fb5876ac131d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0801c56831b4c6428200db6318638a2129bb197a.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0801c56831b4c6428200db6318638a2129bb197a.hip deleted file mode 100644 index e0dd05a50330..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0801c56831b4c6428200db6318638a2129bb197a.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::fp16_t, - false, - true, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0836d5dfc0f939ab9a4064b403339373caf35b56.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0836d5dfc0f939ab9a4064b403339373caf35b56.hip deleted file mode 100644 index 188be6dbcaa9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0836d5dfc0f939ab9a4064b403339373caf35b56.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0842c4e3aabdf55405b3ce09ce1899245ddf11ad.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0842c4e3aabdf55405b3ce09ce1899245ddf11ad.hip deleted file mode 100644 index a7e145637b17..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0842c4e3aabdf55405b3ce09ce1899245ddf11ad.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_085722b43cde5f37242edb071f639da7c4a0bd48.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_085722b43cde5f37242edb071f639da7c4a0bd48.hip deleted file mode 100644 index 2b1591bea8dc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_085722b43cde5f37242edb071f639da7c4a0bd48.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, 
- false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0878b9aa31429d23a93cd953cc6a2fc5f43d0d3a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0878b9aa31429d23a93cd953cc6a2fc5f43d0d3a.hip deleted file mode 100644 index 85b9289f7132..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0878b9aa31429d23a93cd953cc6a2fc5f43d0d3a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_089a347aef8a920e3b59d5ffe71fc5bfe002609c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_089a347aef8a920e3b59d5ffe71fc5bfe002609c.hip deleted file mode 100644 index 7c6617552ed5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_089a347aef8a920e3b59d5ffe71fc5bfe002609c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_089de13222caec1483207d4a54249f8da4f9c151.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_089de13222caec1483207d4a54249f8da4f9c151.hip
deleted file mode 100644
index d214aec72e7d..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_089de13222caec1483207d4a54249f8da4f9c151.hip
+++ /dev/null
@@ -1,138 +0,0 @@
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_091cb49c1958fb4342d79f367ea93cf2b472f785.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_091cb49c1958fb4342d79f367ea93cf2b472f785.hip
deleted file mode 100644
index 3e7612dcfb1b..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_091cb49c1958fb4342d79f367ea93cf2b472f785.hip
+++ /dev/null
@@ -1,138 +0,0 @@
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_093834d4d3fe76e1745e4482c6b51b550c6f3dfc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_093834d4d3fe76e1745e4482c6b51b550c6f3dfc.hip
deleted file mode 100644
index 1b65db905e3f..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_093834d4d3fe76e1745e4482c6b51b550c6f3dfc.hip
+++ /dev/null
@@ -1,138 +0,0 @@
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_09513bff5c1da6aadf11d2e8272a422eabff21bc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_09513bff5c1da6aadf11d2e8272a422eabff21bc.hip
deleted file mode 100644
index d932430b1612..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_09513bff5c1da6aadf11d2e8272a422eabff21bc.hip
+++ /dev/null
@@ -1,138 +0,0 @@
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_096863cd93d1b105a617d0daa1d4f37d7fb6b893.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_096863cd93d1b105a617d0daa1d4f37d7fb6b893.hip
deleted file mode 100644
index 99cf5eb1411f..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_096863cd93d1b105a617d0daa1d4f37d7fb6b893.hip
+++ /dev/null
@@ -1,138 +0,0 @@
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0968cebd81ade762c2f92fffc0153fa7a2b91eb5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0968cebd81ade762c2f92fffc0153fa7a2b91eb5.hip
deleted file mode 100644
index 2df3d42336a3..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0968cebd81ade762c2f92fffc0153fa7a2b91eb5.hip
+++ /dev/null
@@ -1,138 +0,0 @@
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_096e888c52d0f4a5847d7515fcc66208b1ff40d3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_096e888c52d0f4a5847d7515fcc66208b1ff40d3.hip
deleted file mode 100644
index 579b87f23f40..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_096e888c52d0f4a5847d7515fcc66208b1ff40d3.hip
+++ /dev/null
@@ -1,138 +0,0 @@
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_097b3e1dae9bfb2e89398706508f8e01966fd4ea.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_097b3e1dae9bfb2e89398706508f8e01966fd4ea.hip
deleted file mode 100644
index 0a105968b2d4..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_097b3e1dae9bfb2e89398706508f8e01966fd4ea.hip
+++ /dev/null
@@ -1,138 +0,0 @@
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_09d76cca48b71dbcc9bd96734787209fee4c9a74.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_09d76cca48b71dbcc9bd96734787209fee4c9a74.hip
deleted file mode 100644
index 2382f2569520..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_09d76cca48b71dbcc9bd96734787209fee4c9a74.hip
+++ /dev/null
@@ -1,138 +0,0 @@
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_09e50367b62bb09071e28b44235a7c112645a706.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_09e50367b62bb09071e28b44235a7c112645a706.hip
deleted file mode 100644
index c6c9a21f84fd..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_09e50367b62bb09071e28b44235a7c112645a706.hip
+++ /dev/null
@@ -1,138 +0,0 @@
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_09ecb6347009f6a5d5530a6acf90f9f40288cbcf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_09ecb6347009f6a5d5530a6acf90f9f40288cbcf.hip
deleted file mode 100644
index fe5ee00e6d67..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_09ecb6347009f6a5d5530a6acf90f9f40288cbcf.hip
+++ /dev/null
@@ -1,80 +0,0 @@
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a2b116fd5065109aae46ee547e4f49ad0e9d6e1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a2b116fd5065109aae46ee547e4f49ad0e9d6e1.hip
deleted file mode 100644
index b3b32d1d9e4f..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a2b116fd5065109aae46ee547e4f49ad0e9d6e1.hip
+++ /dev/null
@@ -1,80 +0,0 @@
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a4e76d89b175e1d9fd2e9fb908d5fce1ebb945d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a4e76d89b175e1d9fd2e9fb908d5fce1ebb945d.hip
deleted file mode 100644
index da2e456243bd..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a4e76d89b175e1d9fd2e9fb908d5fce1ebb945d.hip
+++ /dev/null
@@ -1,80 +0,0 @@
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a55ed15ef58c941e06dda890aeb530e28eb7bba.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a55ed15ef58c941e06dda890aeb530e28eb7bba.hip
deleted file mode 100644
index 758b29a93695..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a55ed15ef58c941e06dda890aeb530e28eb7bba.hip
+++ /dev/null
@@ -1,138 +0,0 @@
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a672fca51de618e3441cf8764e8e83eb782f2c7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a672fca51de618e3441cf8764e8e83eb782f2c7.hip
deleted file mode 100644
index c6fe88763aee..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a672fca51de618e3441cf8764e8e83eb782f2c7.hip
+++ /dev/null
@@ -1,138 +0,0 @@
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a68c2f9a3acdd787b81be455cbc7836c8bfd90c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a68c2f9a3acdd787b81be455cbc7836c8bfd90c.hip
deleted file mode 100644
index e61f57695693..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a68c2f9a3acdd787b81be455cbc7836c8bfd90c.hip
+++ /dev/null
@@ -1,80 +0,0 @@
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a89417a043556970f72eebd48b4f3e7ac15377a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a89417a043556970f72eebd48b4f3e7ac15377a.hip
deleted file mode 100644
index 6ca2ca060c39..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a89417a043556970f72eebd48b4f3e7ac15377a.hip
+++ /dev/null
@@ -1,138 +0,0 @@
constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a92671b6ea99891c0d69b1c793f4d131b9a82ed.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a92671b6ea99891c0d69b1c793f4d131b9a82ed.hip deleted file mode 100644 index 75ee19bea8d4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0a92671b6ea99891c0d69b1c793f4d131b9a82ed.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - 
using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0aafb881e34a3794970a1282af740b3f19c138b1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0aafb881e34a3794970a1282af740b3f19c138b1.hip deleted file mode 100644 index 6e28dab684c1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0aafb881e34a3794970a1282af740b3f19c138b1.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = 
k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ace6e29e1d3060c3086c08fe27b471e375f9c75.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ace6e29e1d3060c3086c08fe27b471e375f9c75.hip deleted file mode 100644 index 55f55e980f2d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ace6e29e1d3060c3086c08fe27b471e375f9c75.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ad9d68fcee021437e13ffdf94d78252205f5a31.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ad9d68fcee021437e13ffdf94d78252205f5a31.hip deleted file mode 100644 index fe4433b0d994..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ad9d68fcee021437e13ffdf94d78252205f5a31.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b2647b5982405a48e8c8888552a4b89386ccdd9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b2647b5982405a48e8c8888552a4b89386ccdd9.hip deleted file mode 100644 index 04bafdae15d7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b2647b5982405a48e8c8888552a4b89386ccdd9.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b2efefea81036641561bed80c75d77651176f74.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b2efefea81036641561bed80c75d77651176f74.hip deleted file mode 100644 index 617f65d6a870..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b2efefea81036641561bed80c75d77651176f74.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b3153af7bcdba33115a0d31f121fd76be2ffbcc.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b3153af7bcdba33115a0d31f121fd76be2ffbcc.hip deleted file mode 100644 index 8859071d0707..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b3153af7bcdba33115a0d31f121fd76be2ffbcc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b532fcf26f90c82a792cde7943634f667c1d033.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b532fcf26f90c82a792cde7943634f667c1d033.hip deleted file mode 100644 index fbec5b827fee..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b532fcf26f90c82a792cde7943634f667c1d033.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b90a0186d8b8004e3f19886c7992c8e04d0e066.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b90a0186d8b8004e3f19886c7992c8e04d0e066.hip deleted file mode 100644 index fed24fa0e95e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b90a0186d8b8004e3f19886c7992c8e04d0e066.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b9585ba1c10acf67115c5899b3546608541820d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b9585ba1c10acf67115c5899b3546608541820d.hip deleted file mode 100644 index d615a130ca32..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0b9585ba1c10acf67115c5899b3546608541820d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - 
if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0bb81407c8a2b3cdc5fecf655b3ad64d5d729cc9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0bb81407c8a2b3cdc5fecf655b3ad64d5d729cc9.hip deleted file mode 100644 index 5125929e3bea..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0bb81407c8a2b3cdc5fecf655b3ad64d5d729cc9.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t 
kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0bc7910aac798f0555e9e505ad7f177c9fbbd92c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0bc7910aac798f0555e9e505ad7f177c9fbbd92c.hip deleted file mode 100644 index 64520c318a5d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0bc7910aac798f0555e9e505ad7f177c9fbbd92c.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 64, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0be8cf70c6be969ecfca675782c860b5b75ac089.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0be8cf70c6be969ecfca675782c860b5b75ac089.hip deleted file mode 100644 index e6477808a9b9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0be8cf70c6be969ecfca675782c860b5b75ac089.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0befed50a89d80c22b2c8c3d5ba67d73c3d0190e.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0befed50a89d80c22b2c8c3d5ba67d73c3d0190e.hip deleted file mode 100644 index b6f0661741a8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0befed50a89d80c22b2c8c3d5ba67d73c3d0190e.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c32a2d9701e23dd930119c4ee8089042b5b0ac5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c32a2d9701e23dd930119c4ee8089042b5b0ac5.hip deleted file mode 100644 index 85b926d7b8bb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c32a2d9701e23dd930119c4ee8089042b5b0ac5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c3b2ec99fa7b09c7f78dcc3142a661d686044ac.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c3b2ec99fa7b09c7f78dcc3142a661d686044ac.hip deleted file mode 100644 index bcc0e1185d46..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c3b2ec99fa7b09c7f78dcc3142a661d686044ac.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c8a0bb89a6f05289c0405df5126fa0cc16252e7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c8a0bb89a6f05289c0405df5126fa0cc16252e7.hip deleted file mode 100644 index c573d72c183a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c8a0bb89a6f05289c0405df5126fa0cc16252e7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c93c65e5942a2f43f2e491547add02777dd2eee.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c93c65e5942a2f43f2e491547add02777dd2eee.hip deleted file mode 100644 index 884925151454..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c93c65e5942a2f43f2e491547add02777dd2eee.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::bf16_t, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c9bd38b8f9009d932ec49204fdea39a52885246.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c9bd38b8f9009d932ec49204fdea39a52885246.hip deleted file mode 100644 index 112619152470..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0c9bd38b8f9009d932ec49204fdea39a52885246.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, 
grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0caeedaa7d50f1741d618fb6c573529eebb075b1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0caeedaa7d50f1741d618fb6c573529eebb075b1.hip deleted file mode 100644 index 95c691d24d3c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0caeedaa7d50f1741d618fb6c573529eebb075b1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - 
fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0cdef49859c80c6b3ba18eb2fb4c35c72abc1cf2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0cdef49859c80c6b3ba18eb2fb4c35c72abc1cf2.hip deleted file mode 100644 index 331d0079fcd9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0cdef49859c80c6b3ba18eb2fb4c35c72abc1cf2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0cee6b9427c164d78994150305a47f73954a67c0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0cee6b9427c164d78994150305a47f73954a67c0.hip deleted file mode 100644 index cd7110665afb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0cee6b9427c164d78994150305a47f73954a67c0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, 
grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0d0e0147a92061d32608a34e7b47bd534eb787fa.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0d0e0147a92061d32608a34e7b47bd534eb787fa.hip deleted file mode 100644 index 74aab67e6f82..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0d0e0147a92061d32608a34e7b47bd534eb787fa.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - 
fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0d13a4c8d169877da6408584dc1f20a6f7c5e3aa.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0d13a4c8d169877da6408584dc1f20a6f7c5e3aa.hip deleted file mode 100644 index 43fe4e0fdd16..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0d13a4c8d169877da6408584dc1f20a6f7c5e3aa.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0dde401aa76cb5425563cbbdb0362748148da3ca.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0dde401aa76cb5425563cbbdb0362748148da3ca.hip deleted file mode 100644 index 56ed4317ebe9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0dde401aa76cb5425563cbbdb0362748148da3ca.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e007c36231ccdae12f102eacca1f74b0711b9c6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e007c36231ccdae12f102eacca1f74b0711b9c6.hip deleted file mode 100644 index a16ef08dff9f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e007c36231ccdae12f102eacca1f74b0711b9c6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e0a2370f2a320484d8f9f21e3197425c2dbe9ad.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e0a2370f2a320484d8f9f21e3197425c2dbe9ad.hip deleted file mode 100644 index 2de437aaeab2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e0a2370f2a320484d8f9f21e3197425c2dbe9ad.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 32, - true, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e1dbc9c433ce8ec33ace9e62550261d613db582.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e1dbc9c433ce8ec33ace9e62550261d613db582.hip deleted file mode 100644 index 1961b436f0b3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e1dbc9c433ce8ec33ace9e62550261d613db582.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 64, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e3f4cd28a4c06cc109f6a0798a77844bcc750b7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e3f4cd28a4c06cc109f6a0798a77844bcc750b7.hip deleted file mode 100644 index 64e8dc795f95..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e3f4cd28a4c06cc109f6a0798a77844bcc750b7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e661b5f30566d1f159f060c264849c7ae4772f1.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e661b5f30566d1f159f060c264849c7ae4772f1.hip deleted file mode 100644 index 321c06e27fcf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0e661b5f30566d1f159f060c264849c7ae4772f1.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ebacd06455ab20eba78b389462946716b5819f6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ebacd06455ab20eba78b389462946716b5819f6.hip deleted file mode 100644 index 5041498b6447..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ebacd06455ab20eba78b389462946716b5819f6.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ef309b923172f4c0fb38d9b9f5325b33b4877c2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ef309b923172f4c0fb38d9b9f5325b33b4877c2.hip deleted file mode 100644 index a86b86e858c1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ef309b923172f4c0fb38d9b9f5325b33b4877c2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ef9b9413697d6f4573c6605bff6f58d027c5016.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ef9b9413697d6f4573c6605bff6f58d027c5016.hip deleted file mode 100644 index d717a3afff85..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0ef9b9413697d6f4573c6605bff6f58d027c5016.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, 
ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0efdaa9266a5a464009297dc59db92504f8bf1a3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0efdaa9266a5a464009297dc59db92504f8bf1a3.hip deleted file mode 100644 index 8a351a5edd3a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0efdaa9266a5a464009297dc59db92504f8bf1a3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0f0c699d9c3b0ed62097e38ba05e40e815cf474e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0f0c699d9c3b0ed62097e38ba05e40e815cf474e.hip deleted file mode 100644 index 24ffbe00a191..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0f0c699d9c3b0ed62097e38ba05e40e815cf474e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0f588dcb2ef86677ebf84e406eb802e9921d1f1e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0f588dcb2ef86677ebf84e406eb802e9921d1f1e.hip deleted file mode 100644 index 09e177b2d39e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0f588dcb2ef86677ebf84e406eb802e9921d1f1e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, 
blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0fbb0bef3b388867e75d7a8a187b8b4b650a42ae.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0fbb0bef3b388867e75d7a8a187b8b4b650a42ae.hip deleted file mode 100644 index 572d4bbd651b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0fbb0bef3b388867e75d7a8a187b8b4b650a42ae.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, 
- fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0fbddf533661642d84bf5a16149692d5a892182a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0fbddf533661642d84bf5a16149692d5a892182a.hip deleted file mode 100644 index ea5280dfceff..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0fbddf533661642d84bf5a16149692d5a892182a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0fcb7492feb79e27e0bda73e57ef7dab410e2bb6.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0fcb7492feb79e27e0bda73e57ef7dab410e2bb6.hip deleted file mode 100644 index b3db703d66ed..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0fcb7492feb79e27e0bda73e57ef7dab410e2bb6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0fd4068ea93fcf4df463e3bf3a6898d23b65da7f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0fd4068ea93fcf4df463e3bf3a6898d23b65da7f.hip deleted file mode 100644 index 3d721fee05b8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_0fd4068ea93fcf4df463e3bf3a6898d23b65da7f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, 
- false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_103186dbad604763008e0204a1ea90baecef8877.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_103186dbad604763008e0204a1ea90baecef8877.hip deleted file mode 100644 index cd6beecfbde1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_103186dbad604763008e0204a1ea90baecef8877.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - 
std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1037f1bc50c4a65dac09ba56b701256b701c4322.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1037f1bc50c4a65dac09ba56b701256b701c4322.hip deleted file mode 100644 index dfb8780026c5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1037f1bc50c4a65dac09ba56b701256b701c4322.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_10a055e5c3d6a953d470db5dc21449766248058a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_10a055e5c3d6a953d470db5dc21449766248058a.hip deleted file mode 100644 index 23bf3d8a41aa..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_10a055e5c3d6a953d470db5dc21449766248058a.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - true, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::bf16_t, - true, - true, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_10c24f1f9009e46afa3a59193784cc2575f79056.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_10c24f1f9009e46afa3a59193784cc2575f79056.hip deleted file mode 100644 index f12904567d57..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_10c24f1f9009e46afa3a59193784cc2575f79056.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_10ceed95b0a0a01f844678717c88e0426fb503fd.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_10ceed95b0a0a01f844678717c88e0426fb503fd.hip deleted file mode 100644 index 05cfb54d3ee4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_10ceed95b0a0a01f844678717c88e0426fb503fd.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - 
false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1132b11429034d96d82c82dbfdb69e460ad8a564.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1132b11429034d96d82c82dbfdb69e460ad8a564.hip deleted file mode 100644 index 7f9f7bb8ccde..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1132b11429034d96d82c82dbfdb69e460ad8a564.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_11e7df31541c3aa919e9825ad7dc4432f9a03c0c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_11e7df31541c3aa919e9825ad7dc4432f9a03c0c.hip deleted file mode 100644 index f40c23bce7e7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_11e7df31541c3aa919e9825ad7dc4432f9a03c0c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_11ff174ff2175e9ec22ac3a0fa59dd7713b79643.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_11ff174ff2175e9ec22ac3a0fa59dd7713b79643.hip deleted file mode 100644 index 29a1861d8aad..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_11ff174ff2175e9ec22ac3a0fa59dd7713b79643.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1211733062ed30b876f1d63bffa642d77e258dd6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1211733062ed30b876f1d63bffa642d77e258dd6.hip deleted file mode 100644 index 3716bde6947d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1211733062ed30b876f1d63bffa642d77e258dd6.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_12207f4b6e7fac27d6c16493a5373f448a2aaae8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_12207f4b6e7fac27d6c16493a5373f448a2aaae8.hip deleted file mode 100644 index e5220100370e..000000000000 
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_12207f4b6e7fac27d6c16493a5373f448a2aaae8.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1241814f76107d74ed069ecec99a248676487eee.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1241814f76107d74ed069ecec99a248676487eee.hip deleted file mode 100644 index dd665cc98c95..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1241814f76107d74ed069ecec99a248676487eee.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_12d5c8a4988efe60ef7943ecd73e18a28a736583.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_12d5c8a4988efe60ef7943ecd73e18a28a736583.hip deleted file mode 100644 index 4607dd0a0afd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_12d5c8a4988efe60ef7943ecd73e18a28a736583.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_12d60c8abecb3bc9b84b0ea7851628ab17d8b0b3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_12d60c8abecb3bc9b84b0ea7851628ab17d8b0b3.hip deleted file mode 100644 index bc86062c2f2e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_12d60c8abecb3bc9b84b0ea7851628ab17d8b0b3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_131691f01cc7f29affb88152dd48c7a484315dcd.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_131691f01cc7f29affb88152dd48c7a484315dcd.hip deleted file mode 100644 index 7a562779069f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_131691f01cc7f29affb88152dd48c7a484315dcd.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_131c1fdc4206bb952b2fea675f24e3b09f605eef.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_131c1fdc4206bb952b2fea675f24e3b09f605eef.hip deleted file mode 100644 index b42587134ef0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_131c1fdc4206bb952b2fea675f24e3b09f605eef.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - true, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::bf16_t, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_133c51948cf8584900807998da14d788039f53b9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_133c51948cf8584900807998da14d788039f53b9.hip deleted file mode 100644 index 63e35572c92b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_133c51948cf8584900807998da14d788039f53b9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_135ea67de101135ed5fe04f5cab1ec1d7b3714bb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_135ea67de101135ed5fe04f5cab1ec1d7b3714bb.hip deleted file mode 100644 index c071ea137ca2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_135ea67de101135ed5fe04f5cab1ec1d7b3714bb.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_137fa6780d9e6bde10aec10a875c039fdbbc652e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_137fa6780d9e6bde10aec10a875c039fdbbc652e.hip deleted file mode 100644 index 679eb8d60896..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_137fa6780d9e6bde10aec10a875c039fdbbc652e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. 
All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1386cd75411e61a8dbbaf2b916e62f4f5f99104f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1386cd75411e61a8dbbaf2b916e62f4f5f99104f.hip deleted file mode 100644 index e7b2fc5cf1fe..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1386cd75411e61a8dbbaf2b916e62f4f5f99104f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, 
ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_13d5f2ec83b3331654e37ea0b44d88cd98abaa37.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_13d5f2ec83b3331654e37ea0b44d88cd98abaa37.hip deleted file mode 100644 index 86d734a77718..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_13d5f2ec83b3331654e37ea0b44d88cd98abaa37.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_13f747525ad31e76c88774fb2208e470da9c2310.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_13f747525ad31e76c88774fb2208e470da9c2310.hip deleted file mode 100644 index 5b29c2e23a96..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_13f747525ad31e76c88774fb2208e470da9c2310.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14221590b90c48d3cf259fb4e834ccfaf7f3209b.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14221590b90c48d3cf259fb4e834ccfaf7f3209b.hip deleted file mode 100644 index e554c56f71ff..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14221590b90c48d3cf259fb4e834ccfaf7f3209b.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_144f19363ef26efd36f0436cfa9f84f181a8824c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_144f19363ef26efd36f0436cfa9f84f181a8824c.hip deleted file mode 100644 index 238d01d41536..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_144f19363ef26efd36f0436cfa9f84f181a8824c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_146eb8c40e3146e06936f3141b2c4d92a578ddec.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_146eb8c40e3146e06936f3141b2c4d92a578ddec.hip deleted file mode 100644 index 42f40c02e6a8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_146eb8c40e3146e06936f3141b2c4d92a578ddec.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14baaaf1e90a075ab802c6e7d97c4b1605c8bd72.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14baaaf1e90a075ab802c6e7d97c4b1605c8bd72.hip deleted file mode 100644 index 2a2e0ba23003..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14baaaf1e90a075ab802c6e7d97c4b1605c8bd72.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14c4ebd1792c781d219bd21b691b575f64635730.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14c4ebd1792c781d219bd21b691b575f64635730.hip deleted file mode 100644 index 68b15fe07b80..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14c4ebd1792c781d219bd21b691b575f64635730.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const 
ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14d11aad7b666f500f68b264a2fcca6dfc5f1a05.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14d11aad7b666f500f68b264a2fcca6dfc5f1a05.hip deleted file mode 100644 index d6827fe611b5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14d11aad7b666f500f68b264a2fcca6dfc5f1a05.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - 
false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14d4630876785655bd4950566e81ae0b645c0d3c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14d4630876785655bd4950566e81ae0b645c0d3c.hip deleted file mode 100644 index 968af70552b1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14d4630876785655bd4950566e81ae0b645c0d3c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14f77aeeafe4b28f314fde5ebccfd2a554872781.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14f77aeeafe4b28f314fde5ebccfd2a554872781.hip deleted file mode 100644 index 15a81f690ef9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14f77aeeafe4b28f314fde5ebccfd2a554872781.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 
0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14fea611f3c253aebf726af3e5fdb7e63e18e13a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14fea611f3c253aebf726af3e5fdb7e63e18e13a.hip deleted file mode 100644 index e9281c2470bc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_14fea611f3c253aebf726af3e5fdb7e63e18e13a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - 
fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_151a4425b411596c46c7032f6b83d3152a0e0cd4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_151a4425b411596c46c7032f6b83d3152a0e0cd4.hip deleted file mode 100644 index a9090f2c4a6f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_151a4425b411596c46c7032f6b83d3152a0e0cd4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_153e897098539c3466da9d7a37234daf16476277.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_153e897098539c3466da9d7a37234daf16476277.hip deleted file mode 100644 index b034d2a816dd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_153e897098539c3466da9d7a37234daf16476277.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1552dc38d26f6badb7a9bcb5ce9124d54cc45ed3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1552dc38d26f6badb7a9bcb5ce9124d54cc45ed3.hip deleted file mode 100644 index 89ecc02f153e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1552dc38d26f6badb7a9bcb5ce9124d54cc45ed3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, 
- true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_155bafb551768855c8c01faa63e44764ebe6c110.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_155bafb551768855c8c01faa63e44764ebe6c110.hip deleted file mode 100644 index ae9c12940f1a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_155bafb551768855c8c01faa63e44764ebe6c110.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " 
<< k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_155c3549d067464d186a99b8205317cc000d4898.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_155c3549d067464d186a99b8205317cc000d4898.hip deleted file mode 100644 index 023d6d7a8fab..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_155c3549d067464d186a99b8205317cc000d4898.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ 
> 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1573e3d855d28c54af612ab950b081302891d56d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1573e3d855d28c54af612ab950b081302891d56d.hip deleted file mode 100644 index 73c97ded48fe..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1573e3d855d28c54af612ab950b081302891d56d.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t 
kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_157768cd725813f8111d265cfdfea7f42034e5e9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_157768cd725813f8111d265cfdfea7f42034e5e9.hip deleted file mode 100644 index b21f5b7f8d1a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_157768cd725813f8111d265cfdfea7f42034e5e9.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_157b89d8d625b8244b5cceaa4d3e5fc5a09c8989.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_157b89d8d625b8244b5cceaa4d3e5fc5a09c8989.hip deleted file mode 100644 index d1f406f622a8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_157b89d8d625b8244b5cceaa4d3e5fc5a09c8989.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_158d5ce564c3ae1eefb54e3d41dde2604560ef4a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_158d5ce564c3ae1eefb54e3d41dde2604560ef4a.hip deleted file mode 100644 index e10ea61bbc20..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_158d5ce564c3ae1eefb54e3d41dde2604560ef4a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = 
k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_159ee1f1b44d1a8fbaead65d8449413bb616d15e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_159ee1f1b44d1a8fbaead65d8449413bb616d15e.hip deleted file mode 100644 index d0948fa907a1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_159ee1f1b44d1a8fbaead65d8449413bb616d15e.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15b255dde1a9d915e582ee2a83de7d83190c6a24.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15b255dde1a9d915e582ee2a83de7d83190c6a24.hip deleted file mode 100644 index fc63e594997e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15b255dde1a9d915e582ee2a83de7d83190c6a24.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15cf7068183421b141ed5d6e7fe902d06b6492a1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15cf7068183421b141ed5d6e7fe902d06b6492a1.hip deleted file mode 100644 index 430795d7cac1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15cf7068183421b141ed5d6e7fe902d06b6492a1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15dc02ea7e0908cf0bd48034f5a49debfaa36219.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15dc02ea7e0908cf0bd48034f5a49debfaa36219.hip deleted file mode 100644 index 2c15174057a2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15dc02ea7e0908cf0bd48034f5a49debfaa36219.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15e8e1ab8c63db96843054bb7a98d708ae6a9c44.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15e8e1ab8c63db96843054bb7a98d708ae6a9c44.hip deleted file mode 100644 index 3be74e604c10..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15e8e1ab8c63db96843054bb7a98d708ae6a9c44.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15fe3e8f4add16a088fe44458353fa7c0c4f9658.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15fe3e8f4add16a088fe44458353fa7c0c4f9658.hip deleted file mode 100644 index 29e5a6187071..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_15fe3e8f4add16a088fe44458353fa7c0c4f9658.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_16047b5544acef40e39932672cac6f562e200948.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_16047b5544acef40e39932672cac6f562e200948.hip deleted file mode 100644 index 3b7a1342e0e9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_16047b5544acef40e39932672cac6f562e200948.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1621507cf219fe608715d4e5bb6e5764022e2d61.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1621507cf219fe608715d4e5bb6e5764022e2d61.hip deleted file mode 100644 index fa589b3c519e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1621507cf219fe608715d4e5bb6e5764022e2d61.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_162b0dfbe3f615b1d164290799b2457437a0044b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_162b0dfbe3f615b1d164290799b2457437a0044b.hip deleted file mode 100644 index cc4ef8e9d271..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_162b0dfbe3f615b1d164290799b2457437a0044b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_164a947a6c2ba83a5b1cb7074aee0bdac6c9c64e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_164a947a6c2ba83a5b1cb7074aee0bdac6c9c64e.hip deleted file mode 100644 index f2a1616785ce..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_164a947a6c2ba83a5b1cb7074aee0bdac6c9c64e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_165dfb45658df8f1ae8dc0738ac9614740f2576c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_165dfb45658df8f1ae8dc0738ac9614740f2576c.hip deleted file mode 100644 index 8cc1759f9df0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_165dfb45658df8f1ae8dc0738ac9614740f2576c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_167f5328b035ed59a6f05dfee31edd704c4b07ee.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_167f5328b035ed59a6f05dfee31edd704c4b07ee.hip deleted file mode 100644 index b68e24323a1f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_167f5328b035ed59a6f05dfee31edd704c4b07ee.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1687ddf65ce4ed2997583e20fee9f201e86633b3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1687ddf65ce4ed2997583e20fee9f201e86633b3.hip deleted file mode 100644 index 573dea9236ec..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1687ddf65ce4ed2997583e20fee9f201e86633b3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_16f94f5c65c37624f5458c165daf83517d9e3c81.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_16f94f5c65c37624f5458c165daf83517d9e3c81.hip deleted file mode 100644 index 715c37a9bf39..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_16f94f5c65c37624f5458c165daf83517d9e3c81.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_173c44dd85077e6b12dd06fdcf6b11ba349e1866.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_173c44dd85077e6b12dd06fdcf6b11ba349e1866.hip deleted file mode 100644 index 5721349d162a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_173c44dd85077e6b12dd06fdcf6b11ba349e1866.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; 
- -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_17b9b96edda151072215502cc2b606bf1f6f0b03.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_17b9b96edda151072215502cc2b606bf1f6f0b03.hip deleted file mode 100644 index 2fe640db9a10..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_17b9b96edda151072215502cc2b606bf1f6f0b03.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1847fef2c06ea581b0ab31af1cb0556c572696ad.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1847fef2c06ea581b0ab31af1cb0556c572696ad.hip deleted file mode 100644 index 3ef88ce2cc06..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1847fef2c06ea581b0ab31af1cb0556c572696ad.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_187963e1969301abfa61d06afc97faea2bb4efb1.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_187963e1969301abfa61d06afc97faea2bb4efb1.hip deleted file mode 100644 index 2bc002ad61c1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_187963e1969301abfa61d06afc97faea2bb4efb1.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 128, - true, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1886d4bf54b3a4a9e093360998b2059b3c03d072.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1886d4bf54b3a4a9e093360998b2059b3c03d072.hip deleted file mode 100644 index f46020c36692..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1886d4bf54b3a4a9e093360998b2059b3c03d072.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_188a70d526394e254274df95de0727850820326c.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_188a70d526394e254274df95de0727850820326c.hip deleted file mode 100644 index a35b92ebd3aa..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_188a70d526394e254274df95de0727850820326c.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1899e28aff2fb168cdc3af7132dd7fd09c2e1ced.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1899e28aff2fb168cdc3af7132dd7fd09c2e1ced.hip deleted file mode 100644 index e4fe0ad72f1f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1899e28aff2fb168cdc3af7132dd7fd09c2e1ced.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_18a4d71b31c451a50df7996e3db864bc3c3882ed.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_18a4d71b31c451a50df7996e3db864bc3c3882ed.hip deleted file mode 100644 index a74f83d2c6cb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_18a4d71b31c451a50df7996e3db864bc3c3882ed.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_18b92b4e249195ac3e0c74d246585a4c9e0992fd.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_18b92b4e249195ac3e0c74d246585a4c9e0992fd.hip deleted file mode 100644 index f86e45c6e75f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_18b92b4e249195ac3e0c74d246585a4c9e0992fd.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 64, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_18ed7195a9443c84956c3f32839cb3ab9056bdfc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_18ed7195a9443c84956c3f32839cb3ab9056bdfc.hip deleted file mode 100644 index 482ef07d6a85..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_18ed7195a9443c84956c3f32839cb3ab9056bdfc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1914250fce818584291c69a5f058a58cfbd83df9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1914250fce818584291c69a5f058a58cfbd83df9.hip deleted file mode 100644 index 6d626566787a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1914250fce818584291c69a5f058a58cfbd83df9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return 
ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_193699a5daa14ca2def07489e0b563149bc403f8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_193699a5daa14ca2def07489e0b563149bc403f8.hip deleted file mode 100644 index 04fcedfa0e6c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_193699a5daa14ca2def07489e0b563149bc403f8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_19af6a7f9e5020e8d0f0ca0f6258001f6ce592c1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_19af6a7f9e5020e8d0f0ca0f6258001f6ce592c1.hip deleted file mode 100644 index 14a181fa2ca5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_19af6a7f9e5020e8d0f0ca0f6258001f6ce592c1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_19cd9f7b08cec83736605af63d9fcaf463a1aea4.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_19cd9f7b08cec83736605af63d9fcaf463a1aea4.hip deleted file mode 100644 index 35f450cf73e7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_19cd9f7b08cec83736605af63d9fcaf463a1aea4.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 128, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_19df4e13108e043361e9528b71df56f04f696a0c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_19df4e13108e043361e9528b71df56f04f696a0c.hip deleted file mode 100644 index bd2b484ab1b0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_19df4e13108e043361e9528b71df56f04f696a0c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a11dd5ebb989503a1c182684e7f247e2f8cd9c2.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a11dd5ebb989503a1c182684e7f247e2f8cd9c2.hip deleted file mode 100644 index 9298ada0dcaf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a11dd5ebb989503a1c182684e7f247e2f8cd9c2.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 32, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a236be9da05a07d11cd28034d90cdf89941a172.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a236be9da05a07d11cd28034d90cdf89941a172.hip deleted file mode 100644 index 025ff51c4bb5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a236be9da05a07d11cd28034d90cdf89941a172.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a5e18f6333ed2cce509f07cb8bd5868951d66a0.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a5e18f6333ed2cce509f07cb8bd5868951d66a0.hip deleted file mode 100644 index 01fe775956b9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a5e18f6333ed2cce509f07cb8bd5868951d66a0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a6785392af35e27d6697b584cb6f17a766d3fee.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a6785392af35e27d6697b584cb6f17a766d3fee.hip deleted file mode 100644 index b8076aeff347..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a6785392af35e27d6697b584cb6f17a766d3fee.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, 
- false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a6bc2762b95d550485aa720edaf71138d94cd07.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a6bc2762b95d550485aa720edaf71138d94cd07.hip deleted file mode 100644 index 95adf9fe618e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a6bc2762b95d550485aa720edaf71138d94cd07.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a8da3e6ab050262b659c801ccf9a14787d7f176.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a8da3e6ab050262b659c801ccf9a14787d7f176.hip deleted file mode 100644 index 4a426d3b44bd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a8da3e6ab050262b659c801ccf9a14787d7f176.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a96f0ac76f117e66eba97cb990c2350561ec2ab.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a96f0ac76f117e66eba97cb990c2350561ec2ab.hip deleted file mode 100644 index 47831996f5d0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a96f0ac76f117e66eba97cb990c2350561ec2ab.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a98bcbe900f8c141136d18c114b02fffbe8bca1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a98bcbe900f8c141136d18c114b02fffbe8bca1.hip deleted file mode 100644 index c7f2e83abcb7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a98bcbe900f8c141136d18c114b02fffbe8bca1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - 
false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a99b2625adffa8215276bb88fc65bae944b846b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a99b2625adffa8215276bb88fc65bae944b846b.hip deleted file mode 100644 index 181a3ee0a95e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1a99b2625adffa8215276bb88fc65bae944b846b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1acf2f892742b1d236d2b31a8185c6869126adad.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1acf2f892742b1d236d2b31a8185c6869126adad.hip deleted file mode 100644 index cf9f36d0a048..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1acf2f892742b1d236d2b31a8185c6869126adad.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1b3e7c8969027d3316875f33dc50fe022e05ce37.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1b3e7c8969027d3316875f33dc50fe022e05ce37.hip deleted file mode 100644 index 6d7d63c8f166..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1b3e7c8969027d3316875f33dc50fe022e05ce37.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::bf16_t, - false, - false, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1be43f8b629e7039f57b95866d5777273377470d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1be43f8b629e7039f57b95866d5777273377470d.hip deleted file mode 100644 index 47801621a594..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1be43f8b629e7039f57b95866d5777273377470d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1be746990a2032f0363ad9f9112cc994983f4706.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1be746990a2032f0363ad9f9112cc994983f4706.hip deleted file mode 100644 index 248b8f51f98d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1be746990a2032f0363ad9f9112cc994983f4706.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1bf767e7104cfc8322f26df35907fbf04b8948f3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1bf767e7104cfc8322f26df35907fbf04b8948f3.hip deleted file mode 100644 index 0d1b800a323c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1bf767e7104cfc8322f26df35907fbf04b8948f3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1c1b0f85e085dd0769c566fb16aafe5ab5952714.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1c1b0f85e085dd0769c566fb16aafe5ab5952714.hip deleted file mode 100644 index 243f6df90cf8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1c1b0f85e085dd0769c566fb16aafe5ab5952714.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1c2a2d78176e3f0a78e3ad78217e75a4430c0de5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1c2a2d78176e3f0a78e3ad78217e75a4430c0de5.hip deleted file mode 100644 index c47609b9a2b3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1c2a2d78176e3f0a78e3ad78217e75a4430c0de5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1c65ba6dba01da9caa84ba89453b61d81376763f.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1c65ba6dba01da9caa84ba89453b61d81376763f.hip deleted file mode 100644 index 95217f517ba5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1c65ba6dba01da9caa84ba89453b61d81376763f.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1ca3f45d0be2d1119cccd0af042a3e8adeda2ed7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1ca3f45d0be2d1119cccd0af042a3e8adeda2ed7.hip deleted file mode 100644 index 30608a330e5a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1ca3f45d0be2d1119cccd0af042a3e8adeda2ed7.hip +++ /dev/null @@ -1,1965 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -float fmha_fwd(fmha_fwd_traits t, fmha_fwd_args a, const ck_tile::stream_config& s){ - float r = -1; - if(t.data_type.compare("fp16") == 0){ - if (t.hdim_q <= 32 && t.hdim_v <= 32) { - if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse 
== false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, 
ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 
== 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == 
false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, 
true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == 
true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, true, 
128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else 
if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::fp16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - - } - else if (t.hdim_q <= 64 && t.hdim_v <= 64) { - if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k 
== 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - 
else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 
64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == 
true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, 
ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) 
&& (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, 
true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && 
- (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, true, 128, 64, 32, 64, 32, 64, true, 
ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::fp16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - - } - else if (t.hdim_q <= 128 && t.hdim_v <= 128) { - if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && 
(t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 
128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) 
&& (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, 
ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) 
&& (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - return 
fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - 
(true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, true, 128, 128, 32, 128, 32, 128, true, 
ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && 
(t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::fp16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - - } - else if (t.hdim_q <= 256 && t.hdim_v <= 256) { - if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && 
(a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, 
ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && 
(t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, 
ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && 
(t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using 
trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else 
if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && 
(t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::fp16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - - } - - } - else if(t.data_type.compare("bf16") == 0){ - if (t.hdim_q <= 32 && t.hdim_v <= 32) { - if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 
32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && 
(t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, 
false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && 
(a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && 
(t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, false, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, true, 128, 64, 16, 32, 32, 32, true, 
ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == 
true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && 
(a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<32, ck_tile::bf16_t, true, 128, 64, 16, 32, 32, 32, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - - } - else if (t.hdim_q <= 64 && t.hdim_v <= 64) { - if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, 
ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && 
a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == 
false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, 
true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && 
(t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, 
false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 64 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 64 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, false, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && 
(true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, 
ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && 
(t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<64, ck_tile::bf16_t, true, 128, 64, 32, 64, 32, 64, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - - } - else if (t.hdim_q <= 128 && t.hdim_v <= 128) { - if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 
128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && 
(t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && 
(t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, 
ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k != 0 && 
a.seqlen_k % 128 == 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true) && (a.seqlen_k == 0 || a.seqlen_k % 128 != 0) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, false, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, 
ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == 
bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - 
using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (a.hdim_q % 8 == 0) && (a.hdim_v % 8 == 0)) { - using trait_ = fmha_fwd_traits_<128, ck_tile::bf16_t, true, 128, 128, 32, 128, 32, 128, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - - } - else if (t.hdim_q <= 256 && t.hdim_v <= 256) { - if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, 
false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) 
&& - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && 
(t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = 
fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (a.seqlen_q % 128 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false, false>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == false) 
&& (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true /*a.seqlen_q % 128 != 0*/) && (true /*a.seqlen_k % 128 != 0*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, false, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && 
(t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true 
/*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == true) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == true) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - else if((t.is_group_mode == true) && (t.is_v_rowmajor == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_lse == false) && (t.has_dropout == false) && (t.do_fp8_static_quant == false) && - (true/*group mode spad always true*/) && (true/*group mode skpad always true*/) && (true /*a.hdim_q % 256 != 0*/) && (true /*a.hdim_v % 256 != 0*/)) { - using trait_ = fmha_fwd_traits_<256, ck_tile::bf16_t, true, 128, 128, 32, 256, 32, 256, true, ck_tile::BlockFmhaPipelineEnum::QRKSVS, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - return fmha_fwd_(s, a); - } - - } - - } - - return r; -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1cbf88db44aa5f884438288a325270d29c7a04b6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1cbf88db44aa5f884438288a325270d29c7a04b6.hip deleted file mode 100644 index 73391c712265..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1cbf88db44aa5f884438288a325270d29c7a04b6.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = 
k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1cc459e57bfed5ec7f40ea4a4dd9f72f3ad7a709.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1cc459e57bfed5ec7f40ea4a4dd9f72f3ad7a709.hip deleted file mode 100644 index f482fd18d2a1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1cc459e57bfed5ec7f40ea4a4dd9f72f3ad7a709.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - 
constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d02609fb803ea2697e2c2cef35e6f923d2578cf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d02609fb803ea2697e2c2cef35e6f923d2578cf.hip deleted file mode 100644 index 4e5b46371b7c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d02609fb803ea2697e2c2cef35e6f923d2578cf.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using 
dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d0b822743e0205f60521d38d7c64f589fdf0f58.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d0b822743e0205f60521d38d7c64f589fdf0f58.hip deleted file mode 100644 index 836aa4048ee2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d0b822743e0205f60521d38d7c64f589fdf0f58.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d21263e16dafe79b9fe2f998847296e575c14e7.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d21263e16dafe79b9fe2f998847296e575c14e7.hip deleted file mode 100644 index fb779faa4a2d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d21263e16dafe79b9fe2f998847296e575c14e7.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d3ef3d5ded0dfe2a0bafb52ea8f841658db35fd.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d3ef3d5ded0dfe2a0bafb52ea8f841658db35fd.hip deleted file mode 100644 index 94eff94b23c8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d3ef3d5ded0dfe2a0bafb52ea8f841658db35fd.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d498e418ebbf33bed58b4074d1edf3d9bdd07c5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d498e418ebbf33bed58b4074d1edf3d9bdd07c5.hip deleted file mode 100644 index 5a57720770aa..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1d498e418ebbf33bed58b4074d1edf3d9bdd07c5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1da23de9604b5d98fe02529075bad995954c12ca.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1da23de9604b5d98fe02529075bad995954c12ca.hip deleted file mode 100644 index e9f2586443c0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1da23de9604b5d98fe02529075bad995954c12ca.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1db03461737f1e359f389a8d297476f9b60faabd.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1db03461737f1e359f389a8d297476f9b60faabd.hip deleted file mode 100644 index a6baa018aff3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1db03461737f1e359f389a8d297476f9b60faabd.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - 
false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1dc6e599144a093203fd7f92ac6d3c2cd7180d49.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1dc6e599144a093203fd7f92ac6d3c2cd7180d49.hip deleted file mode 100644 index cf3d8876d621..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1dc6e599144a093203fd7f92ac6d3c2cd7180d49.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1de2f97d49f015b9af0b186801e939c6f357a0c4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1de2f97d49f015b9af0b186801e939c6f357a0c4.hip deleted file mode 100644 index 6925977e633f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1de2f97d49f015b9af0b186801e939c6f357a0c4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1df893ee660d37fba7eaca452ae65b3e45a73087.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1df893ee660d37fba7eaca452ae65b3e45a73087.hip deleted file mode 100644 index ceefbd5e8fcb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1df893ee660d37fba7eaca452ae65b3e45a73087.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e22f2d99804198c61251b4629a3f18ed3dcd42e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e22f2d99804198c61251b4629a3f18ed3dcd42e.hip deleted file mode 100644 index c6b542451105..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e22f2d99804198c61251b4629a3f18ed3dcd42e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e33ce1fa113b221e5303b4093c2c4e748ce8298.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e33ce1fa113b221e5303b4093c2c4e748ce8298.hip deleted file mode 100644 index 4a2cdbc008b8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e33ce1fa113b221e5303b4093c2c4e748ce8298.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e42736d4f677a59a172bd6f162616a437696351.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e42736d4f677a59a172bd6f162616a437696351.hip deleted file mode 100644 index 5c478d32afe5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e42736d4f677a59a172bd6f162616a437696351.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e7d7888480b83c78833214b32e10f37a6e20301.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e7d7888480b83c78833214b32e10f37a6e20301.hip deleted file mode 100644 index 60f771fac945..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e7d7888480b83c78833214b32e10f37a6e20301.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::bf16_t, - false, - true, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e9130607a2d24cb0662a47e9cf12c6602143838.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e9130607a2d24cb0662a47e9cf12c6602143838.hip deleted file mode 100644 index d66b60ea5791..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e9130607a2d24cb0662a47e9cf12c6602143838.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, 
grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e943fcc2e64c618fc1415b3f1a0db4d70aa8494.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e943fcc2e64c618fc1415b3f1a0db4d70aa8494.hip deleted file mode 100644 index 9164d233343e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1e943fcc2e64c618fc1415b3f1a0db4d70aa8494.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - 
fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1edaf9d4270d2ac61c299320e06ba73f44730364.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1edaf9d4270d2ac61c299320e06ba73f44730364.hip deleted file mode 100644 index 89688d0ff5e8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1edaf9d4270d2ac61c299320e06ba73f44730364.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f0cad6ad5b172e51c569e84cd54a19b4eb0ed05.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f0cad6ad5b172e51c569e84cd54a19b4eb0ed05.hip deleted file mode 100644 index df15d8e39bce..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f0cad6ad5b172e51c569e84cd54a19b4eb0ed05.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f13a6d0f8c798c0c4ba4ad202d081899fe081ab.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f13a6d0f8c798c0c4ba4ad202d081899fe081ab.hip deleted file mode 100644 index c0e8df1f9478..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f13a6d0f8c798c0c4ba4ad202d081899fe081ab.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f6bc5faf18be193212217788d476ce6fd384bfb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f6bc5faf18be193212217788d476ce6fd384bfb.hip deleted file mode 100644 index 93513afa554e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f6bc5faf18be193212217788d476ce6fd384bfb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - 
false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f7faa0b33a9aada86f032174afd40d18efa7715.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f7faa0b33a9aada86f032174afd40d18efa7715.hip deleted file mode 100644 index 6b6a858e06dd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f7faa0b33a9aada86f032174afd40d18efa7715.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f81f8cce0d77dec9f977b9eeb0778b70a13fa75.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f81f8cce0d77dec9f977b9eeb0778b70a13fa75.hip deleted file mode 100644 index 9e1dcb2e6f89..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1f81f8cce0d77dec9f977b9eeb0778b70a13fa75.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1fcdcb750f382fc7828a9886585f50efbe5be735.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1fcdcb750f382fc7828a9886585f50efbe5be735.hip deleted file mode 100644 index aea7ec22eca1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1fcdcb750f382fc7828a9886585f50efbe5be735.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1fd9fa7c2e13d0bad5fddb2b5a316bbc09d397ea.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1fd9fa7c2e13d0bad5fddb2b5a316bbc09d397ea.hip deleted file mode 100644 index 334483fc2439..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1fd9fa7c2e13d0bad5fddb2b5a316bbc09d397ea.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1fda1c96568eab89a8f6498f8bb23c1223cdc7b0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1fda1c96568eab89a8f6498f8bb23c1223cdc7b0.hip deleted file mode 100644 index efa3ae279800..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_1fda1c96568eab89a8f6498f8bb23c1223cdc7b0.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2005aca3520b171bb82d10ad70fef44f28c19776.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2005aca3520b171bb82d10ad70fef44f28c19776.hip deleted file mode 100644 index 8041b74da852..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2005aca3520b171bb82d10ad70fef44f28c19776.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_204a573ce6b7d2f90aede543939315561cc43177.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_204a573ce6b7d2f90aede543939315561cc43177.hip deleted file mode 100644 index 09c17b863cd4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_204a573ce6b7d2f90aede543939315561cc43177.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
 (remainder of the preceding deleted hunk: autogenerated bf16 forward FmhaFwdKernel instantiation, hdim 256, block tile <128, 128, 32, 256, 32, 256>, BlockFmhaPipelineEnum::QRKSVS, no bias)

The hunks that follow each delete one complete autogenerated file from
aten/src/ATen/native/transformers/hip/flash_attn/ck/. Every file is the same
generate.py output: license banner, ck_tile type/tile/warp aliases, the
trait/mask/dropout definitions, a kernel alias, and the matching fmha_fwd_,
fmha_bwd_dq_dk_dv_, or fmha_bwd_convert_dq_ dispatch specializations. They
differ only in the instantiation parameters summarized per file below:

 fmha_ck_autogen_20588bcac681a5d69f252d7523a3681a0c6b6181.hip | 138 -  bf16 bwd dq_dk_dv, hdim 128, KRKTRVR_IGLP, ALIBI
 fmha_ck_autogen_2081430c92864c29bb9f409e7c27caee1de00749.hip | 138 -  fp16 bwd dq_dk_dv, hdim 32, KRKTRVR, ALIBI
 fmha_ck_autogen_20d5c3c86398f6ce55abc90db3e362dbf9f457f2.hip |  80 -  bf16 fwd, hdim 128, QRKSVS_ASYNC, no bias
 fmha_ck_autogen_20f7ea0aabd069362ba4bbd66623cea5b6e1a6bd.hip | 138 -  fp16 bwd dq_dk_dv, hdim 32, KRKTRVR_IGLP, ALIBI
 fmha_ck_autogen_210ef512b7862837f54acbc3b21e135a192647a3.hip |  80 -  bf16 fwd, hdim 256, QRKSVS, no bias
 fmha_ck_autogen_2122c973581930ab7a4ebc90b3bf1cdaa229a87f.hip | 138 -  fp16 bwd dq_dk_dv, hdim 64, KRKTRVR_IGLP, no bias
 fmha_ck_autogen_21411df58165946bf02942b597d94de7dd856987.hip | 138 -  fp16 bwd dq_dk_dv, hdim 64, KRKTRVR_IGLP, ALIBI
 fmha_ck_autogen_216806a4598c885e517e664fc8280c59ec3cbf11.hip | 138 -  bf16 bwd dq_dk_dv, hdim 64, KRKTRVR, no bias
 fmha_ck_autogen_2173b7c710d418f44dc2b41bec5905024334eae5.hip |  80 -  fp16 fwd, hdim 64, QRKSVS_ASYNC, ALIBI
 fmha_ck_autogen_2177d95cdf45f6fec95d1812f2ef183a75259e38.hip | 138 -  fp16 bwd dq_dk_dv, hdim 128, KRKTRVR, no bias
 fmha_ck_autogen_21828c7d3f5574690f12f841c27f025206e6165b.hip | 138 -  bf16 bwd dq_dk_dv, hdim 64, KRKTRVR, ALIBI
 fmha_ck_autogen_2184fba2eec5899bb40d49d4508196e6be1ec1b1.hip | 138 -  bf16 bwd dq_dk_dv, hdim 64, KRKTRVR_IGLP, ALIBI
 fmha_ck_autogen_21e235e31d6955393ac8e825bd69ead70687b7c8.hip | 138 -  bf16 bwd dq_dk_dv, hdim 64, KRKTRVR, ALIBI
 fmha_ck_autogen_21f860d42fdc2cc6bd743d53ba546e332c22fedf.hip | 138 -  bf16 bwd dq_dk_dv, hdim 128, KRKTRVR, ALIBI
 fmha_ck_autogen_22105635385fbfb5d2f330df83ba6747bcb27f6d.hip |  73 -  fp16 bwd convert_dq, hdim 32
 fmha_ck_autogen_224f9af5e5ca519b21b71a54acb49f50b4999c47.hip |  80 -  bf16 fwd, hdim 64, QRKSVS_ASYNC, no bias
 fmha_ck_autogen_22511de2592b6e350737e44865e1fed6496e3f32.hip |  73 -  bf16 bwd convert_dq, hdim 256
 fmha_ck_autogen_22632f996eb63fbe4bc5748c5897b775087446a0.hip | 138 -  fp16 bwd dq_dk_dv, hdim 64, KRKTRVR_IGLP, no bias
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_226662cf1c9900a4334d2cadcc5f5ac3ad355f05.hip
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_226662cf1c9900a4334d2cadcc5f5ac3ad355f05.hip deleted file mode 100644 index 9003c098b391..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_226662cf1c9900a4334d2cadcc5f5ac3ad355f05.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2273457ac3be01cc1595a015a5f598f8290c77e4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2273457ac3be01cc1595a015a5f598f8290c77e4.hip deleted file mode 100644 index e11c6d0aca7c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2273457ac3be01cc1595a015a5f598f8290c77e4.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_22a07ecf1a59f72ec6bef3e970d7f33cf54c5f44.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_22a07ecf1a59f72ec6bef3e970d7f33cf54c5f44.hip deleted file mode 100644 index 
1d8b60736452..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_22a07ecf1a59f72ec6bef3e970d7f33cf54c5f44.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 
blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_22c142d869ef940ca876c93033ad53b576ed34f2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_22c142d869ef940ca876c93033ad53b576ed34f2.hip deleted file mode 100644 index ee6ff72badae..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_22c142d869ef940ca876c93033ad53b576ed34f2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_23047ea90076e3b0a3eb0586d49b9ee74ca6d279.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_23047ea90076e3b0a3eb0586d49b9ee74ca6d279.hip deleted file mode 100644 index b7c815e3791b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_23047ea90076e3b0a3eb0586d49b9ee74ca6d279.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::bf16_t, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_230861e81e5acc523fa680534eed757b7b4a4e1d.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_230861e81e5acc523fa680534eed757b7b4a4e1d.hip deleted file mode 100644 index afd118c1bbdf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_230861e81e5acc523fa680534eed757b7b4a4e1d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_232f61bf31dbb5de5d7039d5ff2338068a759b68.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_232f61bf31dbb5de5d7039d5ff2338068a759b68.hip deleted file mode 100644 index b4eef334b9ae..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_232f61bf31dbb5de5d7039d5ff2338068a759b68.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_233132e712eba8972ba444c604f89e01c5b84cc0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_233132e712eba8972ba444c604f89e01c5b84cc0.hip deleted file mode 100644 index 5ce878e29280..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_233132e712eba8972ba444c604f89e01c5b84cc0.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::bf16_t, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_235bf652702c2976551778b9159e09188575c63c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_235bf652702c2976551778b9159e09188575c63c.hip deleted file mode 100644 index 5c69cd3a6eeb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_235bf652702c2976551778b9159e09188575c63c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_236b3eef02b904304348b9d35f715b639d63218f.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_236b3eef02b904304348b9d35f715b639d63218f.hip deleted file mode 100644 index 16af5ff2193e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_236b3eef02b904304348b9d35f715b639d63218f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_238e4c1ca112afec494fbe47a85b553302c43395.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_238e4c1ca112afec494fbe47a85b553302c43395.hip deleted file mode 100644 index 136c6cfa2e4a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_238e4c1ca112afec494fbe47a85b553302c43395.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, 
- true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_23914c00690ac5c4f89cdbbaf00732ba66c5c0ef.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_23914c00690ac5c4f89cdbbaf00732ba66c5c0ef.hip deleted file mode 100644 index e3dd044de7d4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_23914c00690ac5c4f89cdbbaf00732ba66c5c0ef.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_23c9b46da8774462de8c24e14b12df3ed596eb57.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_23c9b46da8774462de8c24e14b12df3ed596eb57.hip deleted file mode 100644 index 4eaff6e7b550..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_23c9b46da8774462de8c24e14b12df3ed596eb57.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 128, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_242013527a0266ad479715ee3e6ae01c45de29d0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_242013527a0266ad479715ee3e6ae01c45de29d0.hip deleted file mode 100644 index 5fbd1baa0860..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_242013527a0266ad479715ee3e6ae01c45de29d0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_24410fd9a4150c33186a2a365d06d8f6ea621c20.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_24410fd9a4150c33186a2a365d06d8f6ea621c20.hip deleted file mode 100644 index 77b4c8633836..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_24410fd9a4150c33186a2a365d06d8f6ea621c20.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_245d90000b55ab8b6055b1934880fc6c4870b34b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_245d90000b55ab8b6055b1934880fc6c4870b34b.hip deleted file mode 100644 index 28691cb2b5c1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_245d90000b55ab8b6055b1934880fc6c4870b34b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_24643917fc970c043d1c80d8d4b17ec92deeb8a1.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_24643917fc970c043d1c80d8d4b17ec92deeb8a1.hip deleted file mode 100644 index ca3e3d45b7ae..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_24643917fc970c043d1c80d8d4b17ec92deeb8a1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_249668a3212cd00edaae871758be30a5a1fea589.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_249668a3212cd00edaae871758be30a5a1fea589.hip deleted file mode 100644 index c9634cbab81c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_249668a3212cd00edaae871758be30a5a1fea589.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_249e6b93baae25dff97a0bc9145a8d328ed3f317.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_249e6b93baae25dff97a0bc9145a8d328ed3f317.hip deleted file mode 100644 index 20d06011bc1b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_249e6b93baae25dff97a0bc9145a8d328ed3f317.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2543da478310245e19e6c6a0d9ed7ad99540b3bc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2543da478310245e19e6c6a0d9ed7ad99540b3bc.hip deleted file mode 100644 index 8a21b5c90def..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2543da478310245e19e6c6a0d9ed7ad99540b3bc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_256ef175029a43e64164176d4eb212baf9d27bb9.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_256ef175029a43e64164176d4eb212baf9d27bb9.hip deleted file mode 100644 index 7480ed6987e7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_256ef175029a43e64164176d4eb212baf9d27bb9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_258d747083272ea657604ac84867ecea17bd65da.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_258d747083272ea657604ac84867ecea17bd65da.hip deleted file mode 100644 index 5e57c6c6d3ee..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_258d747083272ea657604ac84867ecea17bd65da.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, 
- false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_25938733446b6c0dcd159719f08d04a9aa467967.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_25938733446b6c0dcd159719f08d04a9aa467967.hip deleted file mode 100644 index 783157a28e5a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_25938733446b6c0dcd159719f08d04a9aa467967.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_25b3225da1e1842f83592971a1f62a0fe30aa9d3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_25b3225da1e1842f83592971a1f62a0fe30aa9d3.hip deleted file mode 100644 index 855839f90bd3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_25b3225da1e1842f83592971a1f62a0fe30aa9d3.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2660282ad39ef034fecbdb74acedfb48620b7dfd.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2660282ad39ef034fecbdb74acedfb48620b7dfd.hip deleted file mode 100644 index b147553021e0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2660282ad39ef034fecbdb74acedfb48620b7dfd.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_26835ba70606c769e56d19dbfe74061361aa855e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_26835ba70606c769e56d19dbfe74061361aa855e.hip deleted file mode 100644 index a43a8c06c907..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_26835ba70606c769e56d19dbfe74061361aa855e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2695783ae8f0034692efd6563f789ef03fd0f4f3.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2695783ae8f0034692efd6563f789ef03fd0f4f3.hip deleted file mode 100644 index e83e4ad89fa8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2695783ae8f0034692efd6563f789ef03fd0f4f3.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_26d77b228420a3ead919474ec9c6fb2800f86890.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_26d77b228420a3ead919474ec9c6fb2800f86890.hip deleted file mode 100644 index 43d564ac04a5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_26d77b228420a3ead919474ec9c6fb2800f86890.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_26ea90eb5a527434c1740933a1d2dd863eccf14c.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_26ea90eb5a527434c1740933a1d2dd863eccf14c.hip deleted file mode 100644 index d3c3953617b9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_26ea90eb5a527434c1740933a1d2dd863eccf14c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_26f90358e522d7bb7c76c3a2c6010f0f38788bb6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_26f90358e522d7bb7c76c3a2c6010f0f38788bb6.hip deleted file mode 100644 index a2b6a74abf63..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_26f90358e522d7bb7c76c3a2c6010f0f38788bb6.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2703018e71d57d3266fc35e2e18a78faa3dd52ce.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2703018e71d57d3266fc35e2e18a78faa3dd52ce.hip deleted file mode 100644 index c14490fad299..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2703018e71d57d3266fc35e2e18a78faa3dd52ce.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr 
ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_278639d44a4a8372a627a7c31e9527c8faa26f97.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_278639d44a4a8372a627a7c31e9527c8faa26f97.hip deleted file mode 100644 index 2a5ec4564ac3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_278639d44a4a8372a627a7c31e9527c8faa26f97.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 64, - true, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_27c2000d32c230a57a6712f27bc0fba02722f5fd.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_27c2000d32c230a57a6712f27bc0fba02722f5fd.hip deleted file mode 100644 index db00c82b5dd0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_27c2000d32c230a57a6712f27bc0fba02722f5fd.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_280bfced8745fbd9266207463fb41476dc23afff.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_280bfced8745fbd9266207463fb41476dc23afff.hip deleted file mode 100644 index 1f63eb6826a2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_280bfced8745fbd9266207463fb41476dc23afff.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_281d897ad17d7f6db2741b396e6b85a9b8f35286.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_281d897ad17d7f6db2741b396e6b85a9b8f35286.hip deleted file mode 100644 index a501ac769e37..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_281d897ad17d7f6db2741b396e6b85a9b8f35286.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_285e61dad8f63fb973cb2eb899c959e400622652.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_285e61dad8f63fb973cb2eb899c959e400622652.hip deleted file mode 100644 index 
f7fa0a6b2db2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_285e61dad8f63fb973cb2eb899c959e400622652.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = 
k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_288458c5a0720ef152848713119ebce6d76db6d6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_288458c5a0720ef152848713119ebce6d76db6d6.hip deleted file mode 100644 index 5b20a7375073..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_288458c5a0720ef152848713119ebce6d76db6d6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - 
if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_289071756e7d0582eb61ce6483fa3c988d2e10b5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_289071756e7d0582eb61ce6483fa3c988d2e10b5.hip deleted file mode 100644 index 143b3336918e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_289071756e7d0582eb61ce6483fa3c988d2e10b5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_28e4d2c757e4b8c366a2c320360e21ff0ef671a8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_28e4d2c757e4b8c366a2c320360e21ff0ef671a8.hip deleted file mode 100644 index 7022269ae247..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_28e4d2c757e4b8c366a2c320360e21ff0ef671a8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_28f1ef32c4384ec26f3dc5e3af6a74fc8cebae92.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_28f1ef32c4384ec26f3dc5e3af6a74fc8cebae92.hip deleted file mode 100644 index 59fdeb64a10a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_28f1ef32c4384ec26f3dc5e3af6a74fc8cebae92.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_28f2e2b108a53308a0cb6c123c8d318cbc2eadb4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_28f2e2b108a53308a0cb6c123c8d318cbc2eadb4.hip deleted file mode 100644 index 32fd3722d39a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_28f2e2b108a53308a0cb6c123c8d318cbc2eadb4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - 
-template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_28f7634d29bef11fd466b452a46b0612f38c949b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_28f7634d29bef11fd466b452a46b0612f38c949b.hip deleted file mode 100644 index ab520e38d645..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_28f7634d29bef11fd466b452a46b0612f38c949b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - 
typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_290c484c2a366258941ee0051e139ea716a9de2f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_290c484c2a366258941ee0051e139ea716a9de2f.hip deleted file mode 100644 index 135ce1fc6fb6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_290c484c2a366258941ee0051e139ea716a9de2f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_291a8bdf9d63b112e7fe5fa7e8835a6789cb8ecf.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_291a8bdf9d63b112e7fe5fa7e8835a6789cb8ecf.hip deleted file mode 100644 index 3d559a09fb65..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_291a8bdf9d63b112e7fe5fa7e8835a6789cb8ecf.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_292454f2d82184ab0491ea0675750c6ec55d659c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_292454f2d82184ab0491ea0675750c6ec55d659c.hip deleted file mode 100644 index b0307da1caad..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_292454f2d82184ab0491ea0675750c6ec55d659c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_292b4f995d622826af5d1f2bffa7ba68467c841a.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_292b4f995d622826af5d1f2bffa7ba68467c841a.hip deleted file mode 100644 index 306043b0f080..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_292b4f995d622826af5d1f2bffa7ba68467c841a.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 64, - true, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_295a523f815eb822d66162d4feb75fe0bc50b648.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_295a523f815eb822d66162d4feb75fe0bc50b648.hip deleted file mode 100644 index 1bb39b2a4f68..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_295a523f815eb822d66162d4feb75fe0bc50b648.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 64, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_296c5836ba118969c4ba89ed62a98dffe3105738.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_296c5836ba118969c4ba89ed62a98dffe3105738.hip deleted file mode 100644 index 54214111437f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_296c5836ba118969c4ba89ed62a98dffe3105738.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::bf16_t, - false, - true, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2995d39cd62f20622a31f11a292ed175abb5fdf9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2995d39cd62f20622a31f11a292ed175abb5fdf9.hip deleted file mode 100644 index d8a77346024f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2995d39cd62f20622a31f11a292ed175abb5fdf9.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::fp16_t, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_29bffc159b0bb826ba489ae763dae141bfe8e802.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_29bffc159b0bb826ba489ae763dae141bfe8e802.hip deleted file mode 100644 index 26f9d5de83e6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_29bffc159b0bb826ba489ae763dae141bfe8e802.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_29c9e5384809b21f39e78bb2e43af345a9a21d19.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_29c9e5384809b21f39e78bb2e43af345a9a21d19.hip deleted file mode 100644 index 87538d2c641a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_29c9e5384809b21f39e78bb2e43af345a9a21d19.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_29fe68ba10b3480dddc9866c51ca8b5efe962cc3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_29fe68ba10b3480dddc9866c51ca8b5efe962cc3.hip deleted file mode 100644 index 82ddd8a6332d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_29fe68ba10b3480dddc9866c51ca8b5efe962cc3.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2a3a980a26682d879c3a3425f3ba5be3f5761adf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2a3a980a26682d879c3a3425f3ba5be3f5761adf.hip deleted file mode 100644 index 01b9f04a43c4..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2a3a980a26682d879c3a3425f3ba5be3f5761adf.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::bf16_t, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2a45129fc4995abcb8f880692f11c6186fc01641.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2a45129fc4995abcb8f880692f11c6186fc01641.hip deleted file mode 100644 index fc6d7c14ada4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2a45129fc4995abcb8f880692f11c6186fc01641.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2a833fc01e88bd8e256ef64ae8251dd0ed10720b.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2a833fc01e88bd8e256ef64ae8251dd0ed10720b.hip deleted file mode 100644 index efcc2d280d78..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2a833fc01e88bd8e256ef64ae8251dd0ed10720b.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2a97c457144cb63a9c6c3d6be613b47bd0df9928.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2a97c457144cb63a9c6c3d6be613b47bd0df9928.hip deleted file mode 100644 index dafe4de3f38a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2a97c457144cb63a9c6c3d6be613b47bd0df9928.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2ad492377add5c8f6d0d2dbf9ee9e4338bbd9f1f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2ad492377add5c8f6d0d2dbf9ee9e4338bbd9f1f.hip deleted file mode 100644 index f2f1fccea3e3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2ad492377add5c8f6d0d2dbf9ee9e4338bbd9f1f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, 
grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2ae344010d49f7f9a6caab2cb84be7f87d2d96bf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2ae344010d49f7f9a6caab2cb84be7f87d2d96bf.hip deleted file mode 100644 index d37e3f4d1772..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2ae344010d49f7f9a6caab2cb84be7f87d2d96bf.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - 
fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2af6c5be53732eb1939a2f93232af7dc011dec1a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2af6c5be53732eb1939a2f93232af7dc011dec1a.hip deleted file mode 100644 index 433c8f0cb5cf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2af6c5be53732eb1939a2f93232af7dc011dec1a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b0bcb241e5a1be1d35366461408d06e095a26ef.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b0bcb241e5a1be1d35366461408d06e095a26ef.hip deleted file mode 100644 index 093825411226..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b0bcb241e5a1be1d35366461408d06e095a26ef.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b3326e055da32cc979892a2fbd0f7b003cb9f98.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b3326e055da32cc979892a2fbd0f7b003cb9f98.hip deleted file mode 100644 index 67290678afc4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b3326e055da32cc979892a2fbd0f7b003cb9f98.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - true, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::fp16_t, - true, - true, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b3af90387f1d227119c5dcd4b71362940bbce52.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b3af90387f1d227119c5dcd4b71362940bbce52.hip deleted file mode 100644 index ff6efd764f80..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b3af90387f1d227119c5dcd4b71362940bbce52.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b4050988e5790a28dbe10b4c20e14f10f6cf85c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b4050988e5790a28dbe10b4c20e14f10f6cf85c.hip deleted file mode 100644 index 96d0bd5d348c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b4050988e5790a28dbe10b4c20e14f10f6cf85c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 
0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b49a9b0801a06dd89c7f7182d7590b515df1592.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b49a9b0801a06dd89c7f7182d7590b515df1592.hip deleted file mode 100644 index ce052c1f899e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b49a9b0801a06dd89c7f7182d7590b515df1592.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b50073f6dfeb7ea77d5dce288a1d2f08f8f6362.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b50073f6dfeb7ea77d5dce288a1d2f08f8f6362.hip deleted file mode 100644 index 4381e1bb9c10..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b50073f6dfeb7ea77d5dce288a1d2f08f8f6362.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b5317b6cde327a842170ebff20c2b03d81379ff.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b5317b6cde327a842170ebff20c2b03d81379ff.hip deleted file mode 100644 index d6118d794756..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b5317b6cde327a842170ebff20c2b03d81379ff.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b8169ce4b4b9a17ac96fbb232e6a93f22071ab4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b8169ce4b4b9a17ac96fbb232e6a93f22071ab4.hip deleted file mode 100644 index d2c2c39e21f5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b8169ce4b4b9a17ac96fbb232e6a93f22071ab4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b823c3b99e7c8d1cdc39a5dbc7365a383bf9ccb.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b823c3b99e7c8d1cdc39a5dbc7365a383bf9ccb.hip deleted file mode 100644 index 8747df7c06ca..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2b823c3b99e7c8d1cdc39a5dbc7365a383bf9ccb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2ba934408c75da5479cc41f96b98ea7d333635ea.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2ba934408c75da5479cc41f96b98ea7d333635ea.hip deleted file mode 100644 index fee242089c6c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2ba934408c75da5479cc41f96b98ea7d333635ea.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2bb6da1095bd8669c0e48b5cd808cf0dcefa2674.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2bb6da1095bd8669c0e48b5cd808cf0dcefa2674.hip deleted file mode 100644 index 
fc42bd2c5183..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2bb6da1095bd8669c0e48b5cd808cf0dcefa2674.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 32, - true, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c0bda0feaade2b554d648d72f219ac9c389bf09.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c0bda0feaade2b554d648d72f219ac9c389bf09.hip deleted file mode 100644 index d565c77b50ef..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c0bda0feaade2b554d648d72f219ac9c389bf09.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c2e75e6f659a500dd3cf2cfd65118f111342119.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c2e75e6f659a500dd3cf2cfd65118f111342119.hip deleted file mode 100644 index 878e0d921d3f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c2e75e6f659a500dd3cf2cfd65118f111342119.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c77bd7e89ed832cc31b2995566a49bec6e4cb52.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c77bd7e89ed832cc31b2995566a49bec6e4cb52.hip deleted file mode 100644 index dc1d525a8788..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c77bd7e89ed832cc31b2995566a49bec6e4cb52.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - 
-template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c7aede7762a524a7a424cc4dc46e43fdedf73a2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c7aede7762a524a7a424cc4dc46e43fdedf73a2.hip deleted file mode 100644 index d2d7919946d2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c7aede7762a524a7a424cc4dc46e43fdedf73a2.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << 
std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c808da5c2514806c2953bb77d5692e5d7c97aa3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c808da5c2514806c2953bb77d5692e5d7c97aa3.hip deleted file mode 100644 index cb3deca9ea0b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c808da5c2514806c2953bb77d5692e5d7c97aa3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c82e3c4e445e1e02f14435e4ca01a90850139a4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c82e3c4e445e1e02f14435e4ca01a90850139a4.hip deleted file mode 100644 index aea19e6c1a5a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c82e3c4e445e1e02f14435e4ca01a90850139a4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - 
typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c9756060ac0e73dbcfc58a9222a78f0283cd029.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c9756060ac0e73dbcfc58a9222a78f0283cd029.hip deleted file mode 100644 index 8ee7f8a3f76f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2c9756060ac0e73dbcfc58a9222a78f0283cd029.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2caba3ab83239e474412fcf89fe0fbef97e51bf1.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2caba3ab83239e474412fcf89fe0fbef97e51bf1.hip deleted file mode 100644 index 397ec8705478..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2caba3ab83239e474412fcf89fe0fbef97e51bf1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ 
- using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2cf351fc2c2da4a8e1760a3affc9a5947c6b3bda.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2cf351fc2c2da4a8e1760a3affc9a5947c6b3bda.hip deleted file mode 100644 index adaeaee2d3e6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2cf351fc2c2da4a8e1760a3affc9a5947c6b3bda.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d06f77a4054ca615d96636c0e2eba2a89850142.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d06f77a4054ca615d96636c0e2eba2a89850142.hip deleted file mode 100644 index 8c8b346e59a1..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d06f77a4054ca615d96636c0e2eba2a89850142.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 128, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d1f2d1e57095f756ddd11e8e9d4f6f253e3ffa3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d1f2d1e57095f756ddd11e8e9d4f6f253e3ffa3.hip deleted file mode 100644 index 900b96883a0a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d1f2d1e57095f756ddd11e8e9d4f6f253e3ffa3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d23a26e0a59a8323dd97632e610d24624143fbe.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d23a26e0a59a8323dd97632e610d24624143fbe.hip deleted file mode 100644 index bd482b70420c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d23a26e0a59a8323dd97632e610d24624143fbe.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d43460c011b8d5e01ea98c9b8ddce962de59a96.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d43460c011b8d5e01ea98c9b8ddce962de59a96.hip deleted file mode 100644 index f2c62b6037c0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d43460c011b8d5e01ea98c9b8ddce962de59a96.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::bf16_t, - false, - true, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d446754d7000673779d15d3e73039fd3c10a720.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d446754d7000673779d15d3e73039fd3c10a720.hip deleted file mode 100644 index 63b3c9480147..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d446754d7000673779d15d3e73039fd3c10a720.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d7b637e0313cb423b22cd8844cc2997b3ff73e4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d7b637e0313cb423b22cd8844cc2997b3ff73e4.hip deleted file mode 100644 index 610af33793e7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d7b637e0313cb423b22cd8844cc2997b3ff73e4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d9a04b7f41dd6f0db017157a44790f35c626e2d.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d9a04b7f41dd6f0db017157a44790f35c626e2d.hip deleted file mode 100644 index 65a7b3e5519e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d9a04b7f41dd6f0db017157a44790f35c626e2d.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d9c659ba43bb907fd4e3e36a50958288bafd1a3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d9c659ba43bb907fd4e3e36a50958288bafd1a3.hip deleted file mode 100644 index da8986d59ccb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2d9c659ba43bb907fd4e3e36a50958288bafd1a3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2da2b905c4ce32234c2af62328adae6b1f9217a8.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2da2b905c4ce32234c2af62328adae6b1f9217a8.hip deleted file mode 100644 index 93d7b0d6a826..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2da2b905c4ce32234c2af62328adae6b1f9217a8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2db33b5442d2e0948762b1f2147a321a9d6907be.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2db33b5442d2e0948762b1f2147a321a9d6907be.hip deleted file mode 100644 index a7f9f5371d00..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2db33b5442d2e0948762b1f2147a321a9d6907be.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2dfac5a83def98340c8786d55a30a98ad68b9eed.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2dfac5a83def98340c8786d55a30a98ad68b9eed.hip deleted file mode 100644 index d9d2e9b7b664..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2dfac5a83def98340c8786d55a30a98ad68b9eed.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2e30f50071113dc4ab59468d568ac9deb06b0342.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2e30f50071113dc4ab59468d568ac9deb06b0342.hip deleted file mode 100644 index 46be21c690e8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2e30f50071113dc4ab59468d568ac9deb06b0342.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2e43e401abbfb1b6737e4dc822f68421abbc648a.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2e43e401abbfb1b6737e4dc822f68421abbc648a.hip deleted file mode 100644 index 874de292b43b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2e43e401abbfb1b6737e4dc822f68421abbc648a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2e8b4260626beeac76c26dbcee3cba1457b30e99.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2e8b4260626beeac76c26dbcee3cba1457b30e99.hip deleted file mode 100644 index 1efa353334df..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2e8b4260626beeac76c26dbcee3cba1457b30e99.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - 
false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2ea394a09c8691a534ad2219bedf73724b6dd5ce.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2ea394a09c8691a534ad2219bedf73724b6dd5ce.hip deleted file mode 100644 index 005eace593b7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2ea394a09c8691a534ad2219bedf73724b6dd5ce.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2eba937ff6d0302ab013db7349d4feb914107f1f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2eba937ff6d0302ab013db7349d4feb914107f1f.hip deleted file mode 100644 index 4a6be87356cf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2eba937ff6d0302ab013db7349d4feb914107f1f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2f0247e301a7b076b6ec8a778c3b47e330638963.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2f0247e301a7b076b6ec8a778c3b47e330638963.hip deleted file mode 100644 index 691f70712fe1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2f0247e301a7b076b6ec8a778c3b47e330638963.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2f32f2d658f1f69840fbad511ce8a3851c859d52.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2f32f2d658f1f69840fbad511ce8a3851c859d52.hip deleted file mode 100644 index 74323057aa21..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2f32f2d658f1f69840fbad511ce8a3851c859d52.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2f55a23a0f24ff7062a4c286944f25d2db3e20a4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2f55a23a0f24ff7062a4c286944f25d2db3e20a4.hip deleted file mode 100644 index 
c3a64b0a4f3c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_2f55a23a0f24ff7062a4c286944f25d2db3e20a4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 
blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_30024440e780fdf9ec94deccc85216d8bbb5788a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_30024440e780fdf9ec94deccc85216d8bbb5788a.hip deleted file mode 100644 index 4171a7a06c93..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_30024440e780fdf9ec94deccc85216d8bbb5788a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_303b7b04496e4db7c1ba2436485dc7c8a4c88448.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_303b7b04496e4db7c1ba2436485dc7c8a4c88448.hip deleted file mode 100644 index 97d97de0f938..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_303b7b04496e4db7c1ba2436485dc7c8a4c88448.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = 
ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3076a6de0e2612279e0ed64612f7393856bcc9ac.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3076a6de0e2612279e0ed64612f7393856bcc9ac.hip deleted file mode 100644 index 2c580f2b0585..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3076a6de0e2612279e0ed64612f7393856bcc9ac.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_30c8e4d5c761fda50e010da779e8e4730051d403.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_30c8e4d5c761fda50e010da779e8e4730051d403.hip deleted file mode 100644 index c14d1116af80..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_30c8e4d5c761fda50e010da779e8e4730051d403.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_30f0200092b0e18d57a9f5e512d565f1c0229436.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_30f0200092b0e18d57a9f5e512d565f1c0229436.hip deleted file mode 100644 index 51f21a30f169..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_30f0200092b0e18d57a9f5e512d565f1c0229436.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3108502fd29d3a24b32177bcea968121ee809115.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3108502fd29d3a24b32177bcea968121ee809115.hip deleted file mode 100644 index 4e454ce1be03..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3108502fd29d3a24b32177bcea968121ee809115.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() 
<< std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3110540b50e95e99a5cccebe47d9d3a83093c2fb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3110540b50e95e99a5cccebe47d9d3a83093c2fb.hip deleted file mode 100644 index 668b2420517d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3110540b50e95e99a5cccebe47d9d3a83093c2fb.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_311104394c8bef8d4ecff35c1409221e723a5a8a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_311104394c8bef8d4ecff35c1409221e723a5a8a.hip deleted file mode 100644 index 51ba80bf06f8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_311104394c8bef8d4ecff35c1409221e723a5a8a.hip +++ /dev/null @@ -1,80 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_311731442b756308c0a869f21b7b8b103aa613e8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_311731442b756308c0a869f21b7b8b103aa613e8.hip deleted file mode 100644 index 29da0a953166..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_311731442b756308c0a869f21b7b8b103aa613e8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31222e158484773d2257f4a31e3dfbdb68336a8e.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31222e158484773d2257f4a31e3dfbdb68336a8e.hip deleted file mode 100644 index 8ee030c03d3a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31222e158484773d2257f4a31e3dfbdb68336a8e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3163272d25bc2db2ffaa1fea87648b45ee68d408.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3163272d25bc2db2ffaa1fea87648b45ee68d408.hip deleted file mode 100644 index cb431db18d6e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3163272d25bc2db2ffaa1fea87648b45ee68d408.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_319df310195191895005b30151da8c1afab6c82f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_319df310195191895005b30151da8c1afab6c82f.hip deleted file mode 100644 index 61c6df30042a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_319df310195191895005b30151da8c1afab6c82f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31a968898f0bc6366313e41eddb5e3a3ed12dc98.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31a968898f0bc6366313e41eddb5e3a3ed12dc98.hip deleted file mode 100644 index bd80920d0605..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31a968898f0bc6366313e41eddb5e3a3ed12dc98.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31b807c48c472e9b1311a6037cd98e21d6706889.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31b807c48c472e9b1311a6037cd98e21d6706889.hip deleted file mode 100644 index a40f5ce8b0c7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31b807c48c472e9b1311a6037cd98e21d6706889.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31c3760f5978baf9780ce4587ae4c768af0e49d1.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31c3760f5978baf9780ce4587ae4c768af0e49d1.hip deleted file mode 100644 index 544d7460be06..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31c3760f5978baf9780ce4587ae4c768af0e49d1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31c4b866692ba5c3d115482bef4790733863c1fc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31c4b866692ba5c3d115482bef4790733863c1fc.hip deleted file mode 100644 index 6c35f5dcc1b7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_31c4b866692ba5c3d115482bef4790733863c1fc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3206cc121ce8955ed59ea3b12b858ee2e0cf82f8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3206cc121ce8955ed59ea3b12b858ee2e0cf82f8.hip deleted file mode 100644 index 9d582e34820a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3206cc121ce8955ed59ea3b12b858ee2e0cf82f8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_320a6196b662a1d3dc7441a9536d825dc356b95d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_320a6196b662a1d3dc7441a9536d825dc356b95d.hip deleted file mode 100644 index 5545873a7c94..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_320a6196b662a1d3dc7441a9536d825dc356b95d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_321500dd4c41e4d68834814a48a639f5ca36a2fb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_321500dd4c41e4d68834814a48a639f5ca36a2fb.hip deleted file mode 100644 index d20025a34548..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_321500dd4c41e4d68834814a48a639f5ca36a2fb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, 
grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_322a86568f89a5a5a165cfffbae9ca6949f2477e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_322a86568f89a5a5a165cfffbae9ca6949f2477e.hip deleted file mode 100644 index c9b29a04bcf8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_322a86568f89a5a5a165cfffbae9ca6949f2477e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - 
fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_32438250078ba2a47345ec4955dafb4e4de78a25.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_32438250078ba2a47345ec4955dafb4e4de78a25.hip deleted file mode 100644 index f628d4b2258f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_32438250078ba2a47345ec4955dafb4e4de78a25.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_32527660fa7aeb9a951a9f2fc3c53989bd141c48.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_32527660fa7aeb9a951a9f2fc3c53989bd141c48.hip deleted file mode 100644 index f7d57096f4cd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_32527660fa7aeb9a951a9f2fc3c53989bd141c48.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_325fbcb9e503e68fafea08abf86a4951f440850f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_325fbcb9e503e68fafea08abf86a4951f440850f.hip deleted file mode 100644 index 02396c643c0d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_325fbcb9e503e68fafea08abf86a4951f440850f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_32652a27e8605cef59c8341813b68e7513be23c5.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_32652a27e8605cef59c8341813b68e7513be23c5.hip deleted file mode 100644 index b2be8c4138c5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_32652a27e8605cef59c8341813b68e7513be23c5.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_327e27892bc57f3dec0da24f94f2a483d6c9321b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_327e27892bc57f3dec0da24f94f2a483d6c9321b.hip deleted file mode 100644 index 1921d88bb6be..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_327e27892bc57f3dec0da24f94f2a483d6c9321b.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - true, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::bf16_t, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_328a311bafd1c153525393b252e4170f8aafb370.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_328a311bafd1c153525393b252e4170f8aafb370.hip deleted file mode 100644 index 45eda877ecc6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_328a311bafd1c153525393b252e4170f8aafb370.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_33099fcfc218ffdf69edb4f2f0e46121bea9fafc.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_33099fcfc218ffdf69edb4f2f0e46121bea9fafc.hip deleted file mode 100644 index 0b8494da39b6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_33099fcfc218ffdf69edb4f2f0e46121bea9fafc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_33746071156e9ad46f403a539dc237e0a44122a7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_33746071156e9ad46f403a539dc237e0a44122a7.hip deleted file mode 100644 index 478b0905146b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_33746071156e9ad46f403a539dc237e0a44122a7.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 256, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_33e7c1e5f41a451c7baff54f7238b220f1bdf8a1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_33e7c1e5f41a451c7baff54f7238b220f1bdf8a1.hip deleted file mode 100644 index 277d1147b742..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_33e7c1e5f41a451c7baff54f7238b220f1bdf8a1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3400f0af03743dce328486f8fc805dd30bd6da31.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3400f0af03743dce328486f8fc805dd30bd6da31.hip deleted file mode 100644 index 32abdd8b2e49..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3400f0af03743dce328486f8fc805dd30bd6da31.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; 
- return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3408103188e27b3bc55dce0c1716c0b4d32d6494.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3408103188e27b3bc55dce0c1716c0b4d32d6494.hip deleted file mode 100644 index 3ea3e0e3f944..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3408103188e27b3bc55dce0c1716c0b4d32d6494.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - 
ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_342d29c85070f488a14b1915f948e5fd69019c99.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_342d29c85070f488a14b1915f948e5fd69019c99.hip deleted file mode 100644 index f790620e2ab5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_342d29c85070f488a14b1915f948e5fd69019c99.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_344932e2655d7b32704be8de9a63bbd8c3369f02.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_344932e2655d7b32704be8de9a63bbd8c3369f02.hip deleted file mode 100644 index 72afe37e6a95..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_344932e2655d7b32704be8de9a63bbd8c3369f02.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::bf16_t, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_345a939a2491166dc520e9a2b9de7e43671e0c2b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_345a939a2491166dc520e9a2b9de7e43671e0c2b.hip deleted file mode 100644 index deb452c60078..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_345a939a2491166dc520e9a2b9de7e43671e0c2b.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 32, - true, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_345ea796c8d97bfe3b7c9663bf15e2e5e7696235.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_345ea796c8d97bfe3b7c9663bf15e2e5e7696235.hip deleted file mode 100644 index 8a68e8a8d228..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_345ea796c8d97bfe3b7c9663bf15e2e5e7696235.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_34807a8e90bf1cd839f32fd718afa6469c35a4fa.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_34807a8e90bf1cd839f32fd718afa6469c35a4fa.hip deleted file mode 100644 index 72f31d894cdc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_34807a8e90bf1cd839f32fd718afa6469c35a4fa.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_349241529745bf138552f49d9a93db418663ad65.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_349241529745bf138552f49d9a93db418663ad65.hip deleted file mode 100644 index 7b3003046a12..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_349241529745bf138552f49d9a93db418663ad65.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_34c2db98d8e2e690f499f41cfd5afb831b756f54.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_34c2db98d8e2e690f499f41cfd5afb831b756f54.hip deleted file mode 100644 index 964ca3a46d0a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_34c2db98d8e2e690f499f41cfd5afb831b756f54.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3511c54e6a6f9eec378d8b661121066536195d3a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3511c54e6a6f9eec378d8b661121066536195d3a.hip deleted file mode 100644 index 37c6017eaed7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3511c54e6a6f9eec378d8b661121066536195d3a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) 
- std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_351425a006aeeff4d69c8570cb6bf1e1427d2c21.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_351425a006aeeff4d69c8570cb6bf1e1427d2c21.hip deleted file mode 100644 index fe7c0cb8523f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_351425a006aeeff4d69c8570cb6bf1e1427d2c21.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = 
k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_354121d3bad1d448bd413718fa096f54faa12e95.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_354121d3bad1d448bd413718fa096f54faa12e95.hip deleted file mode 100644 index 199467f7c90b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_354121d3bad1d448bd413718fa096f54faa12e95.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - 
constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_356f83cb96d0313abcdb24955edd4264df72aed7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_356f83cb96d0313abcdb24955edd4264df72aed7.hip deleted file mode 100644 index e424503f2857..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_356f83cb96d0313abcdb24955edd4264df72aed7.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_357f7e626135cc9176a295f3d1f336a7c3852688.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_357f7e626135cc9176a295f3d1f336a7c3852688.hip deleted file mode 100644 index b8f17aa00210..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_357f7e626135cc9176a295f3d1f336a7c3852688.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, 
grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_358399e756ed5026baf3ab78af17489dc07b9532.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_358399e756ed5026baf3ab78af17489dc07b9532.hip deleted file mode 100644 index bff4d31f4e6c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_358399e756ed5026baf3ab78af17489dc07b9532.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - 
fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_358d28c958c0a831a615a4811d13279b18db09c4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_358d28c958c0a831a615a4811d13279b18db09c4.hip deleted file mode 100644 index 48de526f8be4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_358d28c958c0a831a615a4811d13279b18db09c4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3642b78913a853a62dbff8b99d9ae3fa458f461d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3642b78913a853a62dbff8b99d9ae3fa458f461d.hip deleted file mode 100644 index dc0ebb7be98f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3642b78913a853a62dbff8b99d9ae3fa458f461d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, 
ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_366662dccf2f650bcd8123c49006c759cd4c0ef6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_366662dccf2f650bcd8123c49006c759cd4c0ef6.hip deleted file mode 100644 index 92feaaba9d49..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_366662dccf2f650bcd8123c49006c759cd4c0ef6.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_367e58867c46d96c9bbaa96eaaa9f93595c9e099.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_367e58867c46d96c9bbaa96eaaa9f93595c9e099.hip deleted file mode 100644 index f45066e7e6f9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_367e58867c46d96c9bbaa96eaaa9f93595c9e099.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_36a0a960541bd8a2dc6741579de685b7c0a5f6d7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_36a0a960541bd8a2dc6741579de685b7c0a5f6d7.hip deleted file mode 100644 index 60a96f173203..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_36a0a960541bd8a2dc6741579de685b7c0a5f6d7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_377b70f54cb2778b5ce3df936b477f775eea8b3c.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_377b70f54cb2778b5ce3df936b477f775eea8b3c.hip deleted file mode 100644 index 06085da0febc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_377b70f54cb2778b5ce3df936b477f775eea8b3c.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::fp16_t, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_378759ae25465c32960487375828e23c5f1ac869.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_378759ae25465c32960487375828e23c5f1ac869.hip deleted file mode 100644 index 41b799af35f4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_378759ae25465c32960487375828e23c5f1ac869.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_378bf438642e5d863e31145ada2a0688059aa5d9.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_378bf438642e5d863e31145ada2a0688059aa5d9.hip deleted file mode 100644 index 6ef04047e978..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_378bf438642e5d863e31145ada2a0688059aa5d9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_37ad61bf8427a26775969f8a9166fd0bfb7446b4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_37ad61bf8427a26775969f8a9166fd0bfb7446b4.hip deleted file mode 100644 index 104d78fd220b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_37ad61bf8427a26775969f8a9166fd0bfb7446b4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_37fe04467e87ec2110f60c7aea0cc9bf2ca07481.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_37fe04467e87ec2110f60c7aea0cc9bf2ca07481.hip deleted file mode 100644 index c497fbc81508..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_37fe04467e87ec2110f60c7aea0cc9bf2ca07481.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38010c9bf7341588f071f889b7a0b4dcc4e7a14c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38010c9bf7341588f071f889b7a0b4dcc4e7a14c.hip deleted file mode 100644 index ff242d6a34d3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38010c9bf7341588f071f889b7a0b4dcc4e7a14c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_381b29d9888365bff0f109d897b508eebfd8a61f.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_381b29d9888365bff0f109d897b508eebfd8a61f.hip deleted file mode 100644 index 1512b864eea0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_381b29d9888365bff0f109d897b508eebfd8a61f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3824e97d5ecba46e06d5ec1a9456c810d80227a3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3824e97d5ecba46e06d5ec1a9456c810d80227a3.hip deleted file mode 100644 index ac2f0811a97d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3824e97d5ecba46e06d5ec1a9456c810d80227a3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - 
false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38273a2f8e6bbb42ba0b0871b6c95abb34531f33.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38273a2f8e6bbb42ba0b0871b6c95abb34531f33.hip deleted file mode 100644 index c967125e7ade..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38273a2f8e6bbb42ba0b0871b6c95abb34531f33.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::bf16_t, - false, - false, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38a5ff72f22e0ad040a281e66b1aca0bf3a2aadb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38a5ff72f22e0ad040a281e66b1aca0bf3a2aadb.hip deleted file mode 100644 index 753455f974fd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38a5ff72f22e0ad040a281e66b1aca0bf3a2aadb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38abcbeaa4d33d3150f2b0238bb62ebbfe960980.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38abcbeaa4d33d3150f2b0238bb62ebbfe960980.hip deleted file mode 100644 index 6d38f8c7d276..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38abcbeaa4d33d3150f2b0238bb62ebbfe960980.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, 
grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38b94d76503e13c911781169fbc378517332c42e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38b94d76503e13c911781169fbc378517332c42e.hip deleted file mode 100644 index 1e5bdfdeecdc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38b94d76503e13c911781169fbc378517332c42e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38bb367362fe2c4849ded728ec5dd00969ce188f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38bb367362fe2c4849ded728ec5dd00969ce188f.hip deleted file mode 100644 index bdf3d950c576..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38bb367362fe2c4849ded728ec5dd00969ce188f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38e12dad9e3bafe177ed3c27c833825813e18fc3.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38e12dad9e3bafe177ed3c27c833825813e18fc3.hip deleted file mode 100644 index 4ab3ddfdda93..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38e12dad9e3bafe177ed3c27c833825813e18fc3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ 
- using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38f8a89468cf9c8606cf12a930db062a83cd0ea0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38f8a89468cf9c8606cf12a930db062a83cd0ea0.hip deleted file mode 100644 index b173f8bb50dd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_38f8a89468cf9c8606cf12a930db062a83cd0ea0.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 32, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3937d9dfb68351de2942e32f35e2ca1ce71edfa8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3937d9dfb68351de2942e32f35e2ca1ce71edfa8.hip deleted file mode 100644 index 7c37e2cd7210..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3937d9dfb68351de2942e32f35e2ca1ce71edfa8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_39422621a00ff79b2f5ec0dafb957c77693537b3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_39422621a00ff79b2f5ec0dafb957c77693537b3.hip deleted file mode 100644 index e1815966996c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_39422621a00ff79b2f5ec0dafb957c77693537b3.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3967a8807c9451b09227c0f685c18aafeb062fd2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3967a8807c9451b09227c0f685c18aafeb062fd2.hip deleted file mode 100644 index 227c824a0340..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3967a8807c9451b09227c0f685c18aafeb062fd2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. 
All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3992d5df4ba2e999caf6889a852db4e1ba078e65.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3992d5df4ba2e999caf6889a852db4e1ba078e65.hip deleted file mode 100644 index 33029ba1d5e8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3992d5df4ba2e999caf6889a852db4e1ba078e65.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_39d3071347a0c98f3221104036f477aa13bffa4d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_39d3071347a0c98f3221104036f477aa13bffa4d.hip deleted file mode 100644 index a2f7bc3cba44..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_39d3071347a0c98f3221104036f477aa13bffa4d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3a1dca5feb864e8981387c2d07e62acef1730aa8.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3a1dca5feb864e8981387c2d07e62acef1730aa8.hip deleted file mode 100644 index 3b47770a5d3e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3a1dca5feb864e8981387c2d07e62acef1730aa8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3a2280997eb6f1d091094fc54cecf42b7c9c3a2d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3a2280997eb6f1d091094fc54cecf42b7c9c3a2d.hip deleted file mode 100644 index 085072ead4c0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3a2280997eb6f1d091094fc54cecf42b7c9c3a2d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3a2643099365d0903c799585f41dc1a525ac9f9e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3a2643099365d0903c799585f41dc1a525ac9f9e.hip deleted file mode 100644 index 47ed4c777f34..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3a2643099365d0903c799585f41dc1a525ac9f9e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3a6b9566559ed2b1c85f2bea1c55e72c41dc47bd.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3a6b9566559ed2b1c85f2bea1c55e72c41dc47bd.hip deleted file mode 100644 index b5ed7194c0d3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3a6b9566559ed2b1c85f2bea1c55e72c41dc47bd.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3af86f458fb4dfcceb7db3357fbae0dc15142a15.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3af86f458fb4dfcceb7db3357fbae0dc15142a15.hip deleted file mode 100644 index 2447c84a72d0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3af86f458fb4dfcceb7db3357fbae0dc15142a15.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3afbb5ac9048a962a60f48886728220ae6c2aeaf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3afbb5ac9048a962a60f48886728220ae6c2aeaf.hip deleted file mode 100644 index c356a161aeef..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3afbb5ac9048a962a60f48886728220ae6c2aeaf.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - true, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::fp16_t, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3b26eafe76cca8e74e819220b6de1f4279d48e43.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3b26eafe76cca8e74e819220b6de1f4279d48e43.hip deleted file mode 100644 index 3ef4c0344bb4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3b26eafe76cca8e74e819220b6de1f4279d48e43.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3b4ecb47f9ebe8c2784976c3e9bbe4834b475cf1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3b4ecb47f9ebe8c2784976c3e9bbe4834b475cf1.hip deleted file mode 100644 index 5231a4a905c4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3b4ecb47f9ebe8c2784976c3e9bbe4834b475cf1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3b508b92f7e123b21658f6e17d624ffa87831fee.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3b508b92f7e123b21658f6e17d624ffa87831fee.hip deleted file mode 100644 index f4eec82b6d6e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3b508b92f7e123b21658f6e17d624ffa87831fee.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3b5b3c218e4a7b459e54080e24c5b730221eac02.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3b5b3c218e4a7b459e54080e24c5b730221eac02.hip deleted file mode 100644 index 88e828ab4e51..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3b5b3c218e4a7b459e54080e24c5b730221eac02.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3bb129e6dee6848043dd0e8fa812ae80fec4d014.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3bb129e6dee6848043dd0e8fa812ae80fec4d014.hip deleted file mode 100644 index 5c9a66cff04c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3bb129e6dee6848043dd0e8fa812ae80fec4d014.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3bb3b682eab96e4e173affad75b9d8e73f1dd690.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3bb3b682eab96e4e173affad75b9d8e73f1dd690.hip deleted file mode 100644 index 9e9c2703d250..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3bb3b682eab96e4e173affad75b9d8e73f1dd690.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3be7cea6df8e6dd56194e1172f28943667f1c4ef.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3be7cea6df8e6dd56194e1172f28943667f1c4ef.hip deleted file mode 100644 index 97f8bdbeba39..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3be7cea6df8e6dd56194e1172f28943667f1c4ef.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3bed3aaf24c73073c604a3b23bb4b0358b8e3490.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3bed3aaf24c73073c604a3b23bb4b0358b8e3490.hip deleted file mode 100644 index 739a87ca491c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3bed3aaf24c73073c604a3b23bb4b0358b8e3490.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3c1454ffc1418dac641f63671e947d9f550b1f0c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3c1454ffc1418dac641f63671e947d9f550b1f0c.hip deleted file mode 100644 index 0578fcf163d6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3c1454ffc1418dac641f63671e947d9f550b1f0c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3c38bb80e9880335faaea81985ed5d0e713ecb08.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3c38bb80e9880335faaea81985ed5d0e713ecb08.hip deleted file mode 100644 index 98903225cd75..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3c38bb80e9880335faaea81985ed5d0e713ecb08.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, 
ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3c3b7e4b8c1efe59f79a15512716fce2282a79a7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3c3b7e4b8c1efe59f79a15512716fce2282a79a7.hip deleted file mode 100644 index f98b69cdec05..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3c3b7e4b8c1efe59f79a15512716fce2282a79a7.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 128, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3c64c33870ebc329921cfa3867d58b1857421f65.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3c64c33870ebc329921cfa3867d58b1857421f65.hip deleted file mode 100644 index fcdfd087f3b4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3c64c33870ebc329921cfa3867d58b1857421f65.hip +++ /dev/null @@ -1,138 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - 
-template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3cb0cee09d633b6f70febbba63a1e090522cfb4a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3cb0cee09d633b6f70febbba63a1e090522cfb4a.hip deleted file mode 100644 index dd315917436f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3cb0cee09d633b6f70febbba63a1e090522cfb4a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - 
constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3cce3baac1e3ca03af0c3f4ee4d0158ad1031e9f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3cce3baac1e3ca03af0c3f4ee4d0158ad1031e9f.hip deleted file mode 100644 index a83180734296..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3cce3baac1e3ca03af0c3f4ee4d0158ad1031e9f.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3ccf0a9d5a5451da5dbf6075ccea45e4a140550a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3ccf0a9d5a5451da5dbf6075ccea45e4a140550a.hip deleted file mode 100644 index 9e3620d44cdc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3ccf0a9d5a5451da5dbf6075ccea45e4a140550a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, 
grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3cd7a9ca49c1149d46f6b05b0fefc41ecaeb6ea1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3cd7a9ca49c1149d46f6b05b0fefc41ecaeb6ea1.hip deleted file mode 100644 index 59adc0e9a6bc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3cd7a9ca49c1149d46f6b05b0fefc41ecaeb6ea1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - 
fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3cf45927b6d931e31e2209685d787efa28eed8ba.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3cf45927b6d931e31e2209685d787efa28eed8ba.hip deleted file mode 100644 index 6d952df6c155..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3cf45927b6d931e31e2209685d787efa28eed8ba.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d1cea88a2277b87d405025ba256272a1720f88d.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d1cea88a2277b87d405025ba256272a1720f88d.hip deleted file mode 100644 index 1175c6ae5cc1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d1cea88a2277b87d405025ba256272a1720f88d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d289100991d4c8c362f64c8f6c4ba395c2f3495.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d289100991d4c8c362f64c8f6c4ba395c2f3495.hip deleted file mode 100644 index 521bb0635159..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d289100991d4c8c362f64c8f6c4ba395c2f3495.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d3f3eb2f5eb1f3287879604892b1c230df85f1d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d3f3eb2f5eb1f3287879604892b1c230df85f1d.hip deleted file mode 100644 index 6aae42b410cd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d3f3eb2f5eb1f3287879604892b1c230df85f1d.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d45624dc6e33c477c73a155500b015b6c010de8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d45624dc6e33c477c73a155500b015b6c010de8.hip deleted file mode 100644 index ed04a414d953..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d45624dc6e33c477c73a155500b015b6c010de8.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 256, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d55cb42b0096a8ae338ce100f86e378aa1a04c9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d55cb42b0096a8ae338ce100f86e378aa1a04c9.hip deleted file mode 100644 index 79a154ab3b08..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3d55cb42b0096a8ae338ce100f86e378aa1a04c9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3da8c31f6d5bcaacfa4a21aed4d1d3caecb48922.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3da8c31f6d5bcaacfa4a21aed4d1d3caecb48922.hip deleted file mode 100644 index 80f9ab1612ad..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3da8c31f6d5bcaacfa4a21aed4d1d3caecb48922.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3dba3cd44f78c950fe7ceaa5f0629dfc607b30f1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3dba3cd44f78c950fe7ceaa5f0629dfc607b30f1.hip deleted file mode 100644 index 625575ee0b03..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3dba3cd44f78c950fe7ceaa5f0629dfc607b30f1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - 
false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3dff884e176ec7cff86d17c6afe1ddaa4dd6007d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3dff884e176ec7cff86d17c6afe1ddaa4dd6007d.hip deleted file mode 100644 index 42332838f992..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3dff884e176ec7cff86d17c6afe1ddaa4dd6007d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e143d88eaa0d9cfea856b2f3a57d1275a656627.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e143d88eaa0d9cfea856b2f3a57d1275a656627.hip deleted file mode 100644 index ca6dd81fba63..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e143d88eaa0d9cfea856b2f3a57d1275a656627.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e2557f206fd81d82a3b9d59113105040beb891f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e2557f206fd81d82a3b9d59113105040beb891f.hip deleted file mode 100644 index 6fc591418cc5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e2557f206fd81d82a3b9d59113105040beb891f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 
0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e562e6c3af28b8478020ce3c3bf73c036001c93.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e562e6c3af28b8478020ce3c3bf73c036001c93.hip deleted file mode 100644 index 0c23ebd4c25b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e562e6c3af28b8478020ce3c3bf73c036001c93.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, 
- fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e61b019e1398a6a3c36143fb84b5ff22c9f4508.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e61b019e1398a6a3c36143fb84b5ff22c9f4508.hip deleted file mode 100644 index 877e2efbc2d4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e61b019e1398a6a3c36143fb84b5ff22c9f4508.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e839660557dee9d5bcda9b56940ce23236c5f6d.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e839660557dee9d5bcda9b56940ce23236c5f6d.hip deleted file mode 100644 index 66f78a8f3736..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3e839660557dee9d5bcda9b56940ce23236c5f6d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3eb2ea922daabbba131b90713e06d8caf5f30662.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3eb2ea922daabbba131b90713e06d8caf5f30662.hip deleted file mode 100644 index a358749f938b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3eb2ea922daabbba131b90713e06d8caf5f30662.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::fp16_t, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3ecf565a5a1c4a09887c67ac3b9a019dca427ac0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3ecf565a5a1c4a09887c67ac3b9a019dca427ac0.hip deleted file mode 100644 index f1e2897cb4a9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3ecf565a5a1c4a09887c67ac3b9a019dca427ac0.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3f34433b784d1e405ade3378918641372a30bf6b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3f34433b784d1e405ade3378918641372a30bf6b.hip deleted file mode 100644 index 5c539047c4a3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3f34433b784d1e405ade3378918641372a30bf6b.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3f5e01b4f2ca8ea10898c39d6570bd74e85f46ed.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3f5e01b4f2ca8ea10898c39d6570bd74e85f46ed.hip deleted file mode 100644 index fd32ab0e232f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3f5e01b4f2ca8ea10898c39d6570bd74e85f46ed.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3f7315955f555768f24585a50d75e216c40f062d.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3f7315955f555768f24585a50d75e216c40f062d.hip deleted file mode 100644 index f8cd7b38bd48..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3f7315955f555768f24585a50d75e216c40f062d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3fad30ff0739ab5dede67a96e859f8c474c245f8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3fad30ff0739ab5dede67a96e859f8c474c245f8.hip deleted file mode 100644 index 2b7997908e84..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3fad30ff0739ab5dede67a96e859f8c474c245f8.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 256, - true, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3fcc6893456a559c7d22714116022fc69b372266.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3fcc6893456a559c7d22714116022fc69b372266.hip deleted file mode 100644 index cde002264828..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_3fcc6893456a559c7d22714116022fc69b372266.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4018b1fcee808b6cccd131418b6ae9e8bf900d8f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4018b1fcee808b6cccd131418b6ae9e8bf900d8f.hip deleted file mode 100644 index 881592ca3c96..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4018b1fcee808b6cccd131418b6ae9e8bf900d8f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4018f690b6322588041bb467beabd8a7bc79a2e0.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4018f690b6322588041bb467beabd8a7bc79a2e0.hip deleted file mode 100644 index 5f2553e570fe..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4018f690b6322588041bb467beabd8a7bc79a2e0.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::bf16_t, - false, - true, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_40357c5e9739eae136a7abf92bc38d3ac94753f8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_40357c5e9739eae136a7abf92bc38d3ac94753f8.hip deleted file mode 100644 index ac0f1bb12c2a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_40357c5e9739eae136a7abf92bc38d3ac94753f8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4052ca6a3ec02f6559e4bbf1edde42ad2d127c26.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4052ca6a3ec02f6559e4bbf1edde42ad2d127c26.hip deleted file mode 100644 index e45c558d932f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4052ca6a3ec02f6559e4bbf1edde42ad2d127c26.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_405e7efa263223148318ae96bd1929b382e994e1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_405e7efa263223148318ae96bd1929b382e994e1.hip deleted file mode 100644 index 7e0b78f9e1fd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_405e7efa263223148318ae96bd1929b382e994e1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - 
false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_40aa64439b80ff8dd12498b3e5f6b625da16e285.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_40aa64439b80ff8dd12498b3e5f6b625da16e285.hip deleted file mode 100644 index 827eb5e3c725..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_40aa64439b80ff8dd12498b3e5f6b625da16e285.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_40db688a9189e1c47c300d474df946a248a63303.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_40db688a9189e1c47c300d474df946a248a63303.hip deleted file mode 100644 index dbaef536fa3a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_40db688a9189e1c47c300d474df946a248a63303.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4118e3ab290263ed2576feaf22a1944bf2ddcb7a.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4118e3ab290263ed2576feaf22a1944bf2ddcb7a.hip deleted file mode 100644 index c84ce03b742e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4118e3ab290263ed2576feaf22a1944bf2ddcb7a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_415b183c50dd2663dabe3eb8b780913b778c54ab.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_415b183c50dd2663dabe3eb8b780913b778c54ab.hip deleted file mode 100644 index 9eb2f1e6e0b5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_415b183c50dd2663dabe3eb8b780913b778c54ab.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4160f6b6d0869740a5a411abd80108f729f810eb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4160f6b6d0869740a5a411abd80108f729f810eb.hip deleted file mode 100644 index 4f292068029f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4160f6b6d0869740a5a411abd80108f729f810eb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_417b1cb14b67dc82f614831550f7deb0895bd7e4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_417b1cb14b67dc82f614831550f7deb0895bd7e4.hip deleted file mode 100644 index e0a7ccf88718..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_417b1cb14b67dc82f614831550f7deb0895bd7e4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_419461cdb5687ebbb7bf0be136071d70420c1619.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_419461cdb5687ebbb7bf0be136071d70420c1619.hip deleted file mode 100644 index 9d095aefdff4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_419461cdb5687ebbb7bf0be136071d70420c1619.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_41b68458076e6cb129d3ec793e95b91430a0c8a1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_41b68458076e6cb129d3ec793e95b91430a0c8a1.hip deleted file mode 100644 index c532a5560484..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_41b68458076e6cb129d3ec793e95b91430a0c8a1.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_41db3f29d1940e59dadc357c040ea37a6ff208d9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_41db3f29d1940e59dadc357c040ea37a6ff208d9.hip deleted file mode 100644 index ab6915fb44e3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_41db3f29d1940e59dadc357c040ea37a6ff208d9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4217a48a1677bd26cd48e512f1fc8830a8a551b8.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4217a48a1677bd26cd48e512f1fc8830a8a551b8.hip deleted file mode 100644 index 6a778d4d4a71..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4217a48a1677bd26cd48e512f1fc8830a8a551b8.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_428ce4e14cf94b284ffa735fe03d923cc74c9fe0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_428ce4e14cf94b284ffa735fe03d923cc74c9fe0.hip deleted file mode 100644 index fe79b2f2e066..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_428ce4e14cf94b284ffa735fe03d923cc74c9fe0.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_429b82a27571ac91e3631cbdb7e0a58155abf962.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_429b82a27571ac91e3631cbdb7e0a58155abf962.hip deleted file mode 100644 index 60db1e29bf6d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_429b82a27571ac91e3631cbdb7e0a58155abf962.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::bf16_t, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_42e2326066c91452335eac05f25a6311376bd9e5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_42e2326066c91452335eac05f25a6311376bd9e5.hip deleted file mode 100644 index e54c65f62c8a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_42e2326066c91452335eac05f25a6311376bd9e5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4306c6c37cf472ad262f53941611b5e60072bdf6.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4306c6c37cf472ad262f53941611b5e60072bdf6.hip deleted file mode 100644 index d6237a519991..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4306c6c37cf472ad262f53941611b5e60072bdf6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4347e039c003489dd528faf5d710e687321a3fd7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4347e039c003489dd528faf5d710e687321a3fd7.hip deleted file mode 100644 index 7e988fe4a36b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4347e039c003489dd528faf5d710e687321a3fd7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4356b3a2ff49f72b91a6b9c215df285f2798ad47.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4356b3a2ff49f72b91a6b9c215df285f2798ad47.hip deleted file mode 100644 index e8cc8478fc0d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4356b3a2ff49f72b91a6b9c215df285f2798ad47.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4377ac04be3a6cbdbfbe57612a469412812fb5b5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4377ac04be3a6cbdbfbe57612a469412812fb5b5.hip deleted file mode 100644 index 421c9e3c0931..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4377ac04be3a6cbdbfbe57612a469412812fb5b5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_438e3565f4c720e6c9691b0d33c1392936e2e7ae.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_438e3565f4c720e6c9691b0d33c1392936e2e7ae.hip deleted file mode 100644 index 577d7b2473f4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_438e3565f4c720e6c9691b0d33c1392936e2e7ae.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4395d3c96b3f4556b9765fd0a3b5701b2fb10948.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4395d3c96b3f4556b9765fd0a3b5701b2fb10948.hip deleted file mode 100644 index c642a2e79a28..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4395d3c96b3f4556b9765fd0a3b5701b2fb10948.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - true, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::fp16_t, - true, - true, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_43e7c78e8f65be35e2753a0ad5123118555c56b2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_43e7c78e8f65be35e2753a0ad5123118555c56b2.hip deleted file mode 100644 index f80570bc6209..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_43e7c78e8f65be35e2753a0ad5123118555c56b2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_43f2156a04b18bab55af60e9357f28d8a4604e8e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_43f2156a04b18bab55af60e9357f28d8a4604e8e.hip deleted file mode 100644 index 2a52cb365b4c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_43f2156a04b18bab55af60e9357f28d8a4604e8e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return 
ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4409f2a7deb027e864afdfc9975d3ab93c5dcc9a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4409f2a7deb027e864afdfc9975d3ab93c5dcc9a.hip deleted file mode 100644 index b7efdaef3101..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4409f2a7deb027e864afdfc9975d3ab93c5dcc9a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - 
false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4432c5214c4d40c54ca2d02f0d4785c6d6902370.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4432c5214c4d40c54ca2d02f0d4785c6d6902370.hip deleted file mode 100644 index c6a83f8b04af..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4432c5214c4d40c54ca2d02f0d4785c6d6902370.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44462715ed5f192532760d6f4c66ff9d4e20e254.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44462715ed5f192532760d6f4c66ff9d4e20e254.hip deleted file mode 100644 index 6849fbe134fc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44462715ed5f192532760d6f4c66ff9d4e20e254.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44564dddf8b492d80be54854abb8d1d831e42679.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44564dddf8b492d80be54854abb8d1d831e42679.hip deleted file mode 100644 index dcdc63220906..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44564dddf8b492d80be54854abb8d1d831e42679.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_445cd8fa559588f4264ce6192f2de3e3065365ea.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_445cd8fa559588f4264ce6192f2de3e3065365ea.hip deleted file mode 100644 index b2ec654b4666..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_445cd8fa559588f4264ce6192f2de3e3065365ea.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_445e28a8a51cd435130ded2abc9fc606e522c713.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_445e28a8a51cd435130ded2abc9fc606e522c713.hip deleted file mode 100644 index 339dd8b7bd3f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_445e28a8a51cd435130ded2abc9fc606e522c713.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4462b192a64efb60d5484798526278ac7a0fb9fa.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4462b192a64efb60d5484798526278ac7a0fb9fa.hip deleted file mode 100644 index 61c70a8e60da..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4462b192a64efb60d5484798526278ac7a0fb9fa.hip +++ /dev/null @@ -1,138 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - 
ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4466b6c6b2ec3acb40ac1cda432efa1e4e62d9d9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4466b6c6b2ec3acb40ac1cda432efa1e4e62d9d9.hip deleted file mode 100644 index dff2e53698f2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4466b6c6b2ec3acb40ac1cda432efa1e4e62d9d9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = 
fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44690e48f30657b0fcfa26fb3b9af3ef76e792e3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44690e48f30657b0fcfa26fb3b9af3ef76e792e3.hip deleted file mode 100644 index 23e4a4ab96ac..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44690e48f30657b0fcfa26fb3b9af3ef76e792e3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - 
false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44c181996532676f2140fd026707135144e9d37b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44c181996532676f2140fd026707135144e9d37b.hip deleted file mode 100644 index 77ec3c0090b0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44c181996532676f2140fd026707135144e9d37b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44cc95831c347212021c0bab7b43acd7daabce42.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44cc95831c347212021c0bab7b43acd7daabce42.hip deleted file mode 100644 index 51848f969301..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44cc95831c347212021c0bab7b43acd7daabce42.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44d82b58fdc3e5b7a7c20490ce7f5acce4e6ec79.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44d82b58fdc3e5b7a7c20490ce7f5acce4e6ec79.hip deleted file mode 100644 index 91b73b0dbb77..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_44d82b58fdc3e5b7a7c20490ce7f5acce4e6ec79.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_451fbbdc2dcf2ec81efce34673ee6c425cc16ca2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_451fbbdc2dcf2ec81efce34673ee6c425cc16ca2.hip deleted file mode 100644 index ec99d02933eb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_451fbbdc2dcf2ec81efce34673ee6c425cc16ca2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4568af1b2f104664fd05d21ad789aed39ecfa42b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4568af1b2f104664fd05d21ad789aed39ecfa42b.hip deleted file mode 100644 index 3182b0b45895..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4568af1b2f104664fd05d21ad789aed39ecfa42b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_457eaffbff3c58183a656687010daa2c16cfc26e.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_457eaffbff3c58183a656687010daa2c16cfc26e.hip deleted file mode 100644 index 91674b036d6c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_457eaffbff3c58183a656687010daa2c16cfc26e.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - true, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::fp16_t, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_458d708d13577f2b92e6d5adfe952a87e0cf7be5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_458d708d13577f2b92e6d5adfe952a87e0cf7be5.hip deleted file mode 100644 index b50620ae2c27..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_458d708d13577f2b92e6d5adfe952a87e0cf7be5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_459c8fb6028991321b09a990c2188d854d940268.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_459c8fb6028991321b09a990c2188d854d940268.hip deleted file mode 100644 index 2eeab7640433..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_459c8fb6028991321b09a990c2188d854d940268.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_459ea3713aef9b916e1b38a882a45012930924d3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_459ea3713aef9b916e1b38a882a45012930924d3.hip deleted file mode 100644 index 78376d9504f3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_459ea3713aef9b916e1b38a882a45012930924d3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - 
false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_45b9871c220c0065d74bffeed4021d0304a9625c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_45b9871c220c0065d74bffeed4021d0304a9625c.hip deleted file mode 100644 index 2374b95a48bc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_45b9871c220c0065d74bffeed4021d0304a9625c.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout 
<< ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_45f4363f50af1e7ccd24751d5f5b181bf32c604f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_45f4363f50af1e7ccd24751d5f5b181bf32c604f.hip deleted file mode 100644 index 5bc7f4efab03..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_45f4363f50af1e7ccd24751d5f5b181bf32c604f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - 
if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4601680af41c8738089ff377147e0547dcad114d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4601680af41c8738089ff377147e0547dcad114d.hip deleted file mode 100644 index 8c4c055b966b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4601680af41c8738089ff377147e0547dcad114d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_461737a13e24009bf1a5a4b780175043a9f2e33e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_461737a13e24009bf1a5a4b780175043a9f2e33e.hip deleted file mode 100644 index 7f39d79c0647..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_461737a13e24009bf1a5a4b780175043a9f2e33e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4666db0ff7b035e54f2c0e59acedc2131b722a55.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4666db0ff7b035e54f2c0e59acedc2131b722a55.hip deleted file mode 100644 index ea47087bdc0e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4666db0ff7b035e54f2c0e59acedc2131b722a55.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_468a5f057fd5cef2df5f919f5102f47e86901e3b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_468a5f057fd5cef2df5f919f5102f47e86901e3b.hip deleted file mode 100644 index 57e1edfe630e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_468a5f057fd5cef2df5f919f5102f47e86901e3b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include 
- -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_474fe2d739eca8c93fdcb2c105d4154cee6ca1c1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_474fe2d739eca8c93fdcb2c105d4154cee6ca1c1.hip deleted file mode 100644 index 71b4a27dc4c8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_474fe2d739eca8c93fdcb2c105d4154cee6ca1c1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - 
typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_47548aa042c69bb9c59a8bf706b44028aaa41830.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_47548aa042c69bb9c59a8bf706b44028aaa41830.hip deleted file mode 100644 index d6ce22e239e1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_47548aa042c69bb9c59a8bf706b44028aaa41830.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_47f3ced9b5ddb0dfee8ed5e7df8eca0bbe273047.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_47f3ced9b5ddb0dfee8ed5e7df8eca0bbe273047.hip deleted file mode 100644 index 8a4327297bfa..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_47f3ced9b5ddb0dfee8ed5e7df8eca0bbe273047.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_47fe73f04cef91cd2a0682e905483968ff80eadb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_47fe73f04cef91cd2a0682e905483968ff80eadb.hip deleted file mode 100644 index 078ae2b27172..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_47fe73f04cef91cd2a0682e905483968ff80eadb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_481415463f0316ebe25ff2fda47c68cc54db3359.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_481415463f0316ebe25ff2fda47c68cc54db3359.hip deleted file mode 100644 index 1346c8a21eff..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_481415463f0316ebe25ff2fda47c68cc54db3359.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4824e1f8cda50f80988857611da766685da94494.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4824e1f8cda50f80988857611da766685da94494.hip deleted file mode 100644 index f8948c4ee5f4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4824e1f8cda50f80988857611da766685da94494.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48280c91d7cd8712fd533e246a6b0f758834abc9.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48280c91d7cd8712fd533e246a6b0f758834abc9.hip deleted file mode 100644 index 241555722550..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48280c91d7cd8712fd533e246a6b0f758834abc9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_482e34930d11ff493007b1613993e01acc1af78d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_482e34930d11ff493007b1613993e01acc1af78d.hip deleted file mode 100644 index 983981295cc8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_482e34930d11ff493007b1613993e01acc1af78d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48300e0aeabe337785d4c7b41796ce65df6cc42a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48300e0aeabe337785d4c7b41796ce65df6cc42a.hip deleted file mode 100644 index b09f2f366fe6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48300e0aeabe337785d4c7b41796ce65df6cc42a.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_483eaea4096c8f5bee16a64860432f0634a253d8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_483eaea4096c8f5bee16a64860432f0634a253d8.hip deleted file mode 100644 index 9274da55895a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_483eaea4096c8f5bee16a64860432f0634a253d8.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48435e5dd23e49e19dd313f9891ffec800ce74c2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48435e5dd23e49e19dd313f9891ffec800ce74c2.hip deleted file mode 100644 index 65272f5721ed..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48435e5dd23e49e19dd313f9891ffec800ce74c2.hip +++ /dev/null @@ -1,80 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_486f6c7c7655c34b7b9973ff357b0813f0a3fd7c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_486f6c7c7655c34b7b9973ff357b0813f0a3fd7c.hip deleted file mode 100644 index 93b2a7bfd4e9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_486f6c7c7655c34b7b9973ff357b0813f0a3fd7c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_487724686efd35731e5335efa949486c93ae26e3.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_487724686efd35731e5335efa949486c93ae26e3.hip deleted file mode 100644 index 765e398fd8d1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_487724686efd35731e5335efa949486c93ae26e3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_489e7be0f85656d012a6451b65f6c1d2613b187d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_489e7be0f85656d012a6451b65f6c1d2613b187d.hip deleted file mode 100644 index 6b4f865839da..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_489e7be0f85656d012a6451b65f6c1d2613b187d.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48ae3af78583258c4b13c11a442022e0e058bb85.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48ae3af78583258c4b13c11a442022e0e058bb85.hip deleted file mode 100644 index 4d17028eaba2..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48ae3af78583258c4b13c11a442022e0e058bb85.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - 
constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48d7d145f96aa8958a9208d0c8887742a8c834fd.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48d7d145f96aa8958a9208d0c8887742a8c834fd.hip deleted file mode 100644 index e236e9745ee3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48d7d145f96aa8958a9208d0c8887742a8c834fd.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48e9e858abf6f77489f3fadc4ee81edacd26705a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48e9e858abf6f77489f3fadc4ee81edacd26705a.hip deleted file mode 100644 index 3f250a5d4b4b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_48e9e858abf6f77489f3fadc4ee81edacd26705a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - 
-template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4904c5910a2d0595b39a3f87652a9d1ef4fcbe80.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4904c5910a2d0595b39a3f87652a9d1ef4fcbe80.hip deleted file mode 100644 index 567274c16d5b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4904c5910a2d0595b39a3f87652a9d1ef4fcbe80.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = 
k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_490a68220a7b621ae9817d7b77f55de239b0a4f3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_490a68220a7b621ae9817d7b77f55de239b0a4f3.hip deleted file mode 100644 index f007a0f3896c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_490a68220a7b621ae9817d7b77f55de239b0a4f3.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - true, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::fp16_t, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4911bdd71351610d55916d452495e599960d0a41.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4911bdd71351610d55916d452495e599960d0a41.hip deleted file mode 100644 index 8810e69fc430..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4911bdd71351610d55916d452495e599960d0a41.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_492fbc418e829f89bcb8d93f8afd2869dd8dfccc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_492fbc418e829f89bcb8d93f8afd2869dd8dfccc.hip deleted file mode 100644 index ce3c284e469d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_492fbc418e829f89bcb8d93f8afd2869dd8dfccc.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - true, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::bf16_t, - true, - true, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_49d4c005d723cdab9fbc307933c1257d114b539e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_49d4c005d723cdab9fbc307933c1257d114b539e.hip deleted file mode 100644 index 3d1411a21a88..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_49d4c005d723cdab9fbc307933c1257d114b539e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_49f5017cc0f5c8c8dc71492e7765cf729c1f225c.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_49f5017cc0f5c8c8dc71492e7765cf729c1f225c.hip deleted file mode 100644 index c56084d7dfd8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_49f5017cc0f5c8c8dc71492e7765cf729c1f225c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a06b5b153ea6e8b1e20d9aad9d4633333fd98f5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a06b5b153ea6e8b1e20d9aad9d4633333fd98f5.hip deleted file mode 100644 index e56a4f3b1a79..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a06b5b153ea6e8b1e20d9aad9d4633333fd98f5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - 
false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a2e6b05e7e4de2cb23d815f8b2c8adf22131c0c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a2e6b05e7e4de2cb23d815f8b2c8adf22131c0c.hip deleted file mode 100644 index 69681c5b6f73..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a2e6b05e7e4de2cb23d815f8b2c8adf22131c0c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a4a00bd6ea27ff20a2903d619e1361b5e27672a.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a4a00bd6ea27ff20a2903d619e1361b5e27672a.hip deleted file mode 100644 index 53bb0bfcea13..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a4a00bd6ea27ff20a2903d619e1361b5e27672a.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a5dbf601de5754c03a03a1a42395dc0766fb8ac.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a5dbf601de5754c03a03a1a42395dc0766fb8ac.hip deleted file mode 100644 index 8452a203c0a5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a5dbf601de5754c03a03a1a42395dc0766fb8ac.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a9f3da698a6103caf25d785928dd9f814ac27b4.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a9f3da698a6103caf25d785928dd9f814ac27b4.hip deleted file mode 100644 index 5f38459e753c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4a9f3da698a6103caf25d785928dd9f814ac27b4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ab5d6e8fbfd92e9f7e47bda5cfbb0d4162a6319.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ab5d6e8fbfd92e9f7e47bda5cfbb0d4162a6319.hip deleted file mode 100644 index 748731a663a7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ab5d6e8fbfd92e9f7e47bda5cfbb0d4162a6319.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4afd02981f92fbef6277c1985cc479c12bae9239.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4afd02981f92fbef6277c1985cc479c12bae9239.hip deleted file mode 100644 index 
55927650b050..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4afd02981f92fbef6277c1985cc479c12bae9239.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = 
k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b1eaca3c37a82d19f8dc91f06764170069ca3af.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b1eaca3c37a82d19f8dc91f06764170069ca3af.hip deleted file mode 100644 index 0a82ef600335..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b1eaca3c37a82d19f8dc91f06764170069ca3af.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; 
- if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b2e7f96b095ebfb66ecc7a75752fba2a63e4f37.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b2e7f96b095ebfb66ecc7a75752fba2a63e4f37.hip deleted file mode 100644 index b69be439eabf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b2e7f96b095ebfb66ecc7a75752fba2a63e4f37.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b30f472f00bec9da0564ddc40e07112b5f9a117.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b30f472f00bec9da0564ddc40e07112b5f9a117.hip deleted file mode 100644 index 576ce59aee2d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b30f472f00bec9da0564ddc40e07112b5f9a117.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b45948f2795293e72530b02669c4f549608ea7f.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b45948f2795293e72530b02669c4f549608ea7f.hip deleted file mode 100644 index 8cfb741bd114..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b45948f2795293e72530b02669c4f549608ea7f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b4c03c916393d6be7c5181369ebcef949eaa763.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b4c03c916393d6be7c5181369ebcef949eaa763.hip deleted file mode 100644 index 2e1dc8be8129..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b4c03c916393d6be7c5181369ebcef949eaa763.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b68e4d00295b294320b94bc777d7d34609127e0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b68e4d00295b294320b94bc777d7d34609127e0.hip deleted file mode 100644 index 7d0265e222e5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b68e4d00295b294320b94bc777d7d34609127e0.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 256, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b7393d55600c9892558248f4131fc06a6cf3309.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b7393d55600c9892558248f4131fc06a6cf3309.hip deleted file mode 100644 index 0288fc1a0391..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b7393d55600c9892558248f4131fc06a6cf3309.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b74439f42140cdda9bb0f78d995d741212a35f4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b74439f42140cdda9bb0f78d995d741212a35f4.hip deleted file mode 100644 index 9cd3049dd775..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b74439f42140cdda9bb0f78d995d741212a35f4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b76e5dce9af523422782dd25d8dcf6f25edc68f.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b76e5dce9af523422782dd25d8dcf6f25edc68f.hip deleted file mode 100644 index 20165a311b74..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4b76e5dce9af523422782dd25d8dcf6f25edc68f.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4baf664bfdf070362bcc91af77d1bc406f744351.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4baf664bfdf070362bcc91af77d1bc406f744351.hip deleted file mode 100644 index 80570a482eea..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4baf664bfdf070362bcc91af77d1bc406f744351.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4bc48576f285325345fa1205e5e7e01787b74f71.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4bc48576f285325345fa1205e5e7e01787b74f71.hip deleted file mode 100644 index 8fff57e47c0b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4bc48576f285325345fa1205e5e7e01787b74f71.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4bd4d46397a3749646b232b306688e52b8c6e584.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4bd4d46397a3749646b232b306688e52b8c6e584.hip deleted file mode 100644 index 8f6a8997a80c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4bd4d46397a3749646b232b306688e52b8c6e584.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - 
false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4be4a98f150f3f9ab6f03b5fd0968c5454565c9a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4be4a98f150f3f9ab6f03b5fd0968c5454565c9a.hip deleted file mode 100644 index c57d9dc56cde..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4be4a98f150f3f9ab6f03b5fd0968c5454565c9a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4beca56234ff6fb4f23b9b24822887fd9a3d0df9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4beca56234ff6fb4f23b9b24822887fd9a3d0df9.hip deleted file mode 100644 index ed0665c06de5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4beca56234ff6fb4f23b9b24822887fd9a3d0df9.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4bef4d120e71bfcfe61d67aa44d24ceb907c2b9e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4bef4d120e71bfcfe61d67aa44d24ceb907c2b9e.hip deleted file mode 100644 index ae14e6439950..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4bef4d120e71bfcfe61d67aa44d24ceb907c2b9e.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 256, - true, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4c0c50a1fac82d47dff2357ee3ddbfa0b2c8d487.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4c0c50a1fac82d47dff2357ee3ddbfa0b2c8d487.hip deleted file mode 100644 index f42d922d6e84..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4c0c50a1fac82d47dff2357ee3ddbfa0b2c8d487.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::bf16_t, - false, - true, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4c69d06e3f32e3b6d28d3e54ad764b472741c193.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4c69d06e3f32e3b6d28d3e54ad764b472741c193.hip deleted file mode 100644 index 732661f9dcde..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4c69d06e3f32e3b6d28d3e54ad764b472741c193.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4c8720923c3452e3aebd7b9c1b4b23f0c35d7e4f.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4c8720923c3452e3aebd7b9c1b4b23f0c35d7e4f.hip deleted file mode 100644 index ef2b8892fd63..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4c8720923c3452e3aebd7b9c1b4b23f0c35d7e4f.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4cabdafad0bf803223ba5e8f474cd59233dc48cb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4cabdafad0bf803223ba5e8f474cd59233dc48cb.hip deleted file mode 100644 index ac3556c259a5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4cabdafad0bf803223ba5e8f474cd59233dc48cb.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::fp16_t, - false, - false, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4cb1861e31df98bdfd731efc3d335055090d83af.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4cb1861e31df98bdfd731efc3d335055090d83af.hip deleted file mode 100644 index cc1405c6e9f9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4cb1861e31df98bdfd731efc3d335055090d83af.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4cd3de43cc1f7588d62a10362f59d113ee818846.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4cd3de43cc1f7588d62a10362f59d113ee818846.hip deleted file mode 100644 index 3e7751fbfb2e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4cd3de43cc1f7588d62a10362f59d113ee818846.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ce03571f1d2779bdeaf0a6a2d617e236d191c11.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ce03571f1d2779bdeaf0a6a2d617e236d191c11.hip deleted file mode 100644 index 09a279e493ad..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ce03571f1d2779bdeaf0a6a2d617e236d191c11.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ce671f5defd76ca08614a7a1f184c36c0f1e2ab.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ce671f5defd76ca08614a7a1f184c36c0f1e2ab.hip deleted file mode 100644 index d2b3f1a43266..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ce671f5defd76ca08614a7a1f184c36c0f1e2ab.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4d3b1ae63e127b6e6afe39e354d4995afc5faeaf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4d3b1ae63e127b6e6afe39e354d4995afc5faeaf.hip deleted file mode 100644 index 426db612bc3c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4d3b1ae63e127b6e6afe39e354d4995afc5faeaf.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4d5f3cf0f78f73df79665c26b20b0805615e1b04.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4d5f3cf0f78f73df79665c26b20b0805615e1b04.hip deleted file mode 100644 index 286759ba9eed..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4d5f3cf0f78f73df79665c26b20b0805615e1b04.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4d65e58c9f147498ed04dd51fe1393770603a6d3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4d65e58c9f147498ed04dd51fe1393770603a6d3.hip deleted file mode 100644 index ca68c918803a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4d65e58c9f147498ed04dd51fe1393770603a6d3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4d7dc0f356b630179916f8fc2041b7f1402b46df.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4d7dc0f356b630179916f8fc2041b7f1402b46df.hip deleted file mode 100644 index 6e0b8bc9a5e1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4d7dc0f356b630179916f8fc2041b7f1402b46df.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4da9e9b7277bc90518ab92860bef2097ba96d982.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4da9e9b7277bc90518ab92860bef2097ba96d982.hip deleted file mode 100644 index 0650d8a40cc3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4da9e9b7277bc90518ab92860bef2097ba96d982.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4db2e63cfebcf84043f79be0321708cd159c62b9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4db2e63cfebcf84043f79be0321708cd159c62b9.hip deleted file mode 100644 index 1ef804b81fa4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4db2e63cfebcf84043f79be0321708cd159c62b9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4dbdd9c3f496a27bde68cf86374999ff2dd53505.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4dbdd9c3f496a27bde68cf86374999ff2dd53505.hip deleted file mode 100644 index fb665d175a89..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4dbdd9c3f496a27bde68cf86374999ff2dd53505.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4dc87b7d385e7b092e4706c464217b004fd8a6a4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4dc87b7d385e7b092e4706c464217b004fd8a6a4.hip deleted file mode 100644 index ab0dc635c08b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4dc87b7d385e7b092e4706c464217b004fd8a6a4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4dde56efe17f4fd36a11cc959320a5e43f1dc232.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4dde56efe17f4fd36a11cc959320a5e43f1dc232.hip deleted file mode 100644 index 1344cf44f5ec..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4dde56efe17f4fd36a11cc959320a5e43f1dc232.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e0a88ccef04e81b8c684b695f7cb4310e448915.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e0a88ccef04e81b8c684b695f7cb4310e448915.hip deleted file mode 100644 index b27ec784fadd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e0a88ccef04e81b8c684b695f7cb4310e448915.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, 
- false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e15e4f16de26068cba30ef12fc29332d45e460e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e15e4f16de26068cba30ef12fc29332d45e460e.hip deleted file mode 100644 index b1c96c252377..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e15e4f16de26068cba30ef12fc29332d45e460e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e47f8fa40332c6ed12d9971e0b539049a871c34.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e47f8fa40332c6ed12d9971e0b539049a871c34.hip deleted file mode 100644 index 96038acc90a6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e47f8fa40332c6ed12d9971e0b539049a871c34.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e760de14b71a41882ec4a2c7362565af36d1a5d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e760de14b71a41882ec4a2c7362565af36d1a5d.hip deleted file mode 100644 index 387c14523322..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e760de14b71a41882ec4a2c7362565af36d1a5d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e79dce18e49ffe024fe4cd0693ad3399f5edaee.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e79dce18e49ffe024fe4cd0693ad3399f5edaee.hip deleted file mode 100644 index 89eaeee1488e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e79dce18e49ffe024fe4cd0693ad3399f5edaee.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e9a933b916285d9580a76df543cfafc88a536cb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e9a933b916285d9580a76df543cfafc88a536cb.hip deleted file mode 100644 index 40933938adbf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4e9a933b916285d9580a76df543cfafc88a536cb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ec2075f394acfb14fae7b1ef4304fd9b654ba0d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ec2075f394acfb14fae7b1ef4304fd9b654ba0d.hip deleted file mode 100644 index 05471d838e42..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ec2075f394acfb14fae7b1ef4304fd9b654ba0d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ed6da5357b67cc28aee4afa9523adaf055c4e32.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ed6da5357b67cc28aee4afa9523adaf055c4e32.hip deleted file mode 100644 index f32f9f82806d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ed6da5357b67cc28aee4afa9523adaf055c4e32.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ef35d82ceb4af2e07719c16109c6d72eaedce67.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ef35d82ceb4af2e07719c16109c6d72eaedce67.hip deleted file mode 100644 index 72f9cb4d3fa2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ef35d82ceb4af2e07719c16109c6d72eaedce67.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, 
blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f0aded9d1baec3125ce8e176248cb146ca580fa.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f0aded9d1baec3125ce8e176248cb146ca580fa.hip deleted file mode 100644 index 7d43d59aff23..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f0aded9d1baec3125ce8e176248cb146ca580fa.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - 
fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f1e1c969b57659e7e1367ac9ba10ed5ef5b69a9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f1e1c969b57659e7e1367ac9ba10ed5ef5b69a9.hip deleted file mode 100644 index 234fd820a79d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f1e1c969b57659e7e1367ac9ba10ed5ef5b69a9.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::fp16_t, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f44435491aa68acb3217b0e693232c67641a2db.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f44435491aa68acb3217b0e693232c67641a2db.hip deleted file mode 100644 index 457e6642d3d3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f44435491aa68acb3217b0e693232c67641a2db.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f4a5d56721bb1a1332a65882132a8c5763932ec.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f4a5d56721bb1a1332a65882132a8c5763932ec.hip deleted file mode 100644 index 6c62c0f7a950..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f4a5d56721bb1a1332a65882132a8c5763932ec.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f6243c6850c0a2d2b7bf1476e12f95f187257b6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f6243c6850c0a2d2b7bf1476e12f95f187257b6.hip deleted file mode 100644 index 12179b6c2b15..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4f6243c6850c0a2d2b7bf1476e12f95f187257b6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4fa4d21931b9afcbd70b1567995d3eeb6f9308aa.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4fa4d21931b9afcbd70b1567995d3eeb6f9308aa.hip deleted file mode 100644 index 2af9bee76393..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4fa4d21931b9afcbd70b1567995d3eeb6f9308aa.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4fa883a36a76edb276a66c5d779294f170d6d4b7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4fa883a36a76edb276a66c5d779294f170d6d4b7.hip deleted file mode 100644 index f6fbbbc4bd8e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4fa883a36a76edb276a66c5d779294f170d6d4b7.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4fd34faa8b168e2ac7862641229e6146d3e28aee.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4fd34faa8b168e2ac7862641229e6146d3e28aee.hip deleted file mode 100644 index 0740c732d38c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4fd34faa8b168e2ac7862641229e6146d3e28aee.hip +++ /dev/null @@ -1,138 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - 
ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4fe530cbf6363a8f08a94728e45e88ecde299e7b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4fe530cbf6363a8f08a94728e45e88ecde299e7b.hip deleted file mode 100644 index 302832389a7b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4fe530cbf6363a8f08a94728e45e88ecde299e7b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = 
fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ff20bafbf156fe8fb80bdd84a5d2f3a4a944c1a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ff20bafbf156fe8fb80bdd84a5d2f3a4a944c1a.hip deleted file mode 100644 index 4c25e48a3586..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_4ff20bafbf156fe8fb80bdd84a5d2f3a4a944c1a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; 
- -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_501dcf3213efd214cc2ce8c9ba0027f991d241b4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_501dcf3213efd214cc2ce8c9ba0027f991d241b4.hip deleted file mode 100644 index 982d4160835f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_501dcf3213efd214cc2ce8c9ba0027f991d241b4.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
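// [Editorial sketch; not part of the patch] Each deleted backward TU in this
// patch has the same shape: it instantiates one FmhaBwdDQDKDVKernel for a
// fixed trait tag (dq_dk_dv_trait_0 above) and specializes three dispatch
// hooks for that tag: a timed launcher (fmha_bwd_dq_dk_dv_), a one-shot
// launcher (fmha_bwd_dq_dk_dv_oneshot_), and a name query
// (fmha_bwd_dq_dk_dv_get_name_). Assuming the template arguments stripped by
// the diff extraction are those trait tags, a dispatcher would drive one
// instantiation roughly like this (run_bwd_instance and its arguments are
// illustrative, relying on the headers this TU already includes):
static float run_bwd_instance(hipStream_t stream, fmha_bwd_args a)
{
    // same one-argument brace-init the generated wrappers use for the stream handle
    ck_tile::stream_config sc{stream};
    // timed path: builds kargs/grids internally and returns whatever launch_kernel reports
    return fmha_bwd_dq_dk_dv_<dq_dk_dv_trait_0>(sc, a);
    // untimed alternative: fmha_bwd_dq_dk_dv_oneshot_<dq_dk_dv_trait_0>(sc, a);
    // kernel name for logging: fmha_bwd_dq_dk_dv_get_name_<dq_dk_dv_trait_0>()
}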
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5052b2318dbb78b1a82ef03666a35a623f44481b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5052b2318dbb78b1a82ef03666a35a623f44481b.hip deleted file mode 100644 index ffe1348e1338..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5052b2318dbb78b1a82ef03666a35a623f44481b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
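// [Editorial sketch; not part of the patch] The forward TUs removed here are
// thinner: each defines a single FmhaFwdKernel over one TileFmhaShape,
// pipeline, and epilogue combination and specializes only the timed launcher
// fmha_fwd_. Assuming the stripped template argument is the trait_0 tag
// defined in the same file, invoking one instance reduces to this
// (illustrative helper, relying on the TU's own includes):
static float run_fwd_instance(hipStream_t stream, fmha_fwd_args a)
{
    ck_tile::stream_config sc{stream};
    // fmha_fwd_create_kargs_and_grids and launch_kernel are called inside the wrapper
    return fmha_fwd_<trait_0>(sc, a);
}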
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5093976cb7b32a8bd28ce92fc13af00a3e21f737.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5093976cb7b32a8bd28ce92fc13af00a3e21f737.hip deleted file mode 100644 index 309bbeb200b2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5093976cb7b32a8bd28ce92fc13af00a3e21f737.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_50e59bd079f4d205b613056f975fd2b4e372ab10.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_50e59bd079f4d205b613056f975fd2b4e372ab10.hip deleted file mode 100644 index 4e6d5a27f3d3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_50e59bd079f4d205b613056f975fd2b4e372ab10.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_50e7b11019fc2299d70869253877319b03388244.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_50e7b11019fc2299d70869253877319b03388244.hip deleted file mode 100644 index 1a0c62228b8f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_50e7b11019fc2299d70869253877319b03388244.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_50f887556a3540609649744957651ca667b91774.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_50f887556a3540609649744957651ca667b91774.hip deleted file mode 100644 index 450980c0f2d0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_50f887556a3540609649744957651ca667b91774.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 256, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_50f915b4d9bd18a3c25a85917392ea4a5e88b349.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_50f915b4d9bd18a3c25a85917392ea4a5e88b349.hip deleted file mode 100644 index dc127fd45ed4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_50f915b4d9bd18a3c25a85917392ea4a5e88b349.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
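// [Editorial note; not part of the patch] The fmha_bwd_dot_do_o_* TU deleted
// above is the small pre-pass of the FlashAttention backward: for each query
// row it reduces D[i] = sum_d dO[i][d] * O[i][d], and the main dq/dk/dv kernel
// reuses D inside the softmax backward (dS = P * (dP - D)). A scalar reference
// for that reduction, illustrative only, with row-major [seqlen, hdim] buffers
// assumed:
static void dot_do_o_reference(const float* dO, const float* O, float* D,
                               int seqlen, int hdim)
{
    for(int i = 0; i < seqlen; ++i)
    {
        float acc = 0.f;
        for(int d = 0; d < hdim; ++d)
            acc += dO[i * hdim + d] * O[i * hdim + d];
        D[i] = acc;
    }
}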
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_515128c6978449b33ce0c35b02a9e9aaad65ef7a.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_515128c6978449b33ce0c35b02a9e9aaad65ef7a.hip deleted file mode 100644 index a1629b2dda38..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_515128c6978449b33ce0c35b02a9e9aaad65ef7a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_522a2a9435103ed405dc1500d31652f1d431a49d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_522a2a9435103ed405dc1500d31652f1d431a49d.hip deleted file mode 100644 index 7f183308151e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_522a2a9435103ed405dc1500d31652f1d431a49d.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_523e5bf45ec5008aa3aba4773e68a78e122b2fe7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_523e5bf45ec5008aa3aba4773e68a78e122b2fe7.hip deleted file mode 100644 index e2bbb093a974..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_523e5bf45ec5008aa3aba4773e68a78e122b2fe7.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_52688999141a72e61322140db29043ef9f7fbc3d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_52688999141a72e61322140db29043ef9f7fbc3d.hip deleted file mode 100644 index afb5c2530d76..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_52688999141a72e61322140db29043ef9f7fbc3d.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_526c89b7a04758b4badbf9695b316f877b8bb053.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_526c89b7a04758b4badbf9695b316f877b8bb053.hip deleted file mode 100644 index 1eecd5f5d21e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_526c89b7a04758b4badbf9695b316f877b8bb053.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_528db08068589c6e4c096054d26a2e5be63285b6.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_528db08068589c6e4c096054d26a2e5be63285b6.hip deleted file mode 100644 index e996953d7831..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_528db08068589c6e4c096054d26a2e5be63285b6.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_52a89981a05963efcea7ba5c1e967638beeebbbb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_52a89981a05963efcea7ba5c1e967638beeebbbb.hip deleted file mode 100644 index fe0072f7afe9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_52a89981a05963efcea7ba5c1e967638beeebbbb.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_52a8a323414448c50571a334f29bc0a38919b61d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_52a8a323414448c50571a334f29bc0a38919b61d.hip deleted file mode 100644 index ee3261f35021..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_52a8a323414448c50571a334f29bc0a38919b61d.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_532a6ffd8a21d3e98342fd401f0247f62ca4e038.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_532a6ffd8a21d3e98342fd401f0247f62ca4e038.hip deleted file mode 100644 index 3aa2b7fd9146..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_532a6ffd8a21d3e98342fd401f0247f62ca4e038.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5344427df3ae9392c4fc4c25c232196828e70648.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5344427df3ae9392c4fc4c25c232196828e70648.hip deleted file mode 100644 index 5f63c871e0f2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5344427df3ae9392c4fc4c25c232196828e70648.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5382a30dcf702daae19bd6705864bfe36e09502c.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5382a30dcf702daae19bd6705864bfe36e09502c.hip deleted file mode 100644 index f0758b4c0d7f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5382a30dcf702daae19bd6705864bfe36e09502c.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_53bd60bd2afee49b30a583c32a45ae9f2076db08.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_53bd60bd2afee49b30a583c32a45ae9f2076db08.hip deleted file mode 100644 index 93d2fe6f26e7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_53bd60bd2afee49b30a583c32a45ae9f2076db08.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 128, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5403eec1cdd216d5c4a7ba977e2ef92a0d7fcc8b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5403eec1cdd216d5c4a7ba977e2ef92a0d7fcc8b.hip deleted file mode 100644 index fa3875103b4f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5403eec1cdd216d5c4a7ba977e2ef92a0d7fcc8b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_540bd57333c6839ccf5cf2e928edb996bc60c371.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_540bd57333c6839ccf5cf2e928edb996bc60c371.hip deleted file mode 100644 index 90f0a6920e03..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_540bd57333c6839ccf5cf2e928edb996bc60c371.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_541874a7633e5713720b9d084b6d1c6715a51a17.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_541874a7633e5713720b9d084b6d1c6715a51a17.hip deleted file mode 100644 index 86e7686be859..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_541874a7633e5713720b9d084b6d1c6715a51a17.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54208a6e8c5263e38f9ffcb062564ab61d2785ff.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54208a6e8c5263e38f9ffcb062564ab61d2785ff.hip deleted file mode 100644 index 53243c8a5482..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54208a6e8c5263e38f9ffcb062564ab61d2785ff.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5435b4651a90e331fcdcf224282457e3dc038a30.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5435b4651a90e331fcdcf224282457e3dc038a30.hip deleted file mode 100644 index 9dd6274ab0de..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5435b4651a90e331fcdcf224282457e3dc038a30.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54402a22ceee3b665a3f24edb98b8398c35c6f5a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54402a22ceee3b665a3f24edb98b8398c35c6f5a.hip deleted file mode 100644 index 6e28a45bcb61..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54402a22ceee3b665a3f24edb98b8398c35c6f5a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54548ad36fb92d0963893146c8db20f53cbf0c8f.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54548ad36fb92d0963893146c8db20f53cbf0c8f.hip deleted file mode 100644 index b121193ace71..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54548ad36fb92d0963893146c8db20f53cbf0c8f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5467aea26852aa9a9e3dae76b906005ddf6fbae1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5467aea26852aa9a9e3dae76b906005ddf6fbae1.hip deleted file mode 100644 index a3661f00788c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5467aea26852aa9a9e3dae76b906005ddf6fbae1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include 
- -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_548b347672451e8391388a400d016803f4c4cf8d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_548b347672451e8391388a400d016803f4c4cf8d.hip deleted file mode 100644 index 5636d23a0525..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_548b347672451e8391388a400d016803f4c4cf8d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - 
typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54940ce53998becf9bddf56df7d19894a7658168.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54940ce53998becf9bddf56df7d19894a7658168.hip deleted file mode 100644 index f1ecf895525e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54940ce53998becf9bddf56df7d19894a7658168.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_549b6956eaf678f7eb901567d1a515eddbedae5f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_549b6956eaf678f7eb901567d1a515eddbedae5f.hip deleted file mode 100644 index 3dbd87c4a277..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_549b6956eaf678f7eb901567d1a515eddbedae5f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54b6e18b10d529eb6b32d7c19c59eaefc7184376.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54b6e18b10d529eb6b32d7c19c59eaefc7184376.hip deleted file mode 100644 index 15f5ee90ef56..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54b6e18b10d529eb6b32d7c19c59eaefc7184376.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54ff49018f1c12b9fa31e523ad40b9cc162ba34d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54ff49018f1c12b9fa31e523ad40b9cc162ba34d.hip deleted file mode 100644 index 6b6225859d91..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_54ff49018f1c12b9fa31e523ad40b9cc162ba34d.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_555ba79201a585bc091ccfc326fd24e851d1eecc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_555ba79201a585bc091ccfc326fd24e851d1eecc.hip deleted file mode 100644 index 8118982e01a0..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_555ba79201a585bc091ccfc326fd24e851d1eecc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); 
- constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_556cd05288e1666f5c67fb87ad02ce660e4c589c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_556cd05288e1666f5c67fb87ad02ce660e4c589c.hip deleted file mode 100644 index dba90557d9de..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_556cd05288e1666f5c67fb87ad02ce660e4c589c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - 
if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55b14cf2998a61611d1de2594e926fcdc378999c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55b14cf2998a61611d1de2594e926fcdc378999c.hip deleted file mode 100644 index 57efa0fcf49b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55b14cf2998a61611d1de2594e926fcdc378999c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55bd9c4f1b7a0621c67f3e964d946ce22fb2fc80.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55bd9c4f1b7a0621c67f3e964d946ce22fb2fc80.hip deleted file mode 100644 index 1de9a2ed79a4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55bd9c4f1b7a0621c67f3e964d946ce22fb2fc80.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55bf8444c1c26b91fd490c7216f4d0f8aa0a1f1a.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55bf8444c1c26b91fd490c7216f4d0f8aa0a1f1a.hip deleted file mode 100644 index a80e52330209..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55bf8444c1c26b91fd490c7216f4d0f8aa0a1f1a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55cda610c235987e13232e828f8d86fa88030560.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55cda610c235987e13232e828f8d86fa88030560.hip deleted file mode 100644 index d9986a2411c3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55cda610c235987e13232e828f8d86fa88030560.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55ea83a47c6299fefa4220ed88f7a8e1dd938215.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55ea83a47c6299fefa4220ed88f7a8e1dd938215.hip deleted file mode 100644 index 09c25a93a079..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_55ea83a47c6299fefa4220ed88f7a8e1dd938215.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr 
ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_566b4782793c6526bfce7362efbf6bf069928b2b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_566b4782793c6526bfce7362efbf6bf069928b2b.hip deleted file mode 100644 index de2ea5cc3e12..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_566b4782793c6526bfce7362efbf6bf069928b2b.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_566e26d4969bc6bbe9b092bedab11cddb3360c0f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_566e26d4969bc6bbe9b092bedab11cddb3360c0f.hip deleted file mode 100644 index 75a9d5b8f657..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_566e26d4969bc6bbe9b092bedab11cddb3360c0f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - 
ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_56964a17f902257aca9d08c736516a2c67d9a0e9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_56964a17f902257aca9d08c736516a2c67d9a0e9.hip deleted file mode 100644 index 6a936b22f4f4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_56964a17f902257aca9d08c736516a2c67d9a0e9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - 
constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_56cc4399c5567a9495f17d54c712cc9e65e57521.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_56cc4399c5567a9495f17d54c712cc9e65e57521.hip deleted file mode 100644 index 9cd9b36869ba..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_56cc4399c5567a9495f17d54c712cc9e65e57521.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = 
- ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_56de9a7dfb1201b56528740e9d8a07b62710fcaf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_56de9a7dfb1201b56528740e9d8a07b62710fcaf.hip deleted file mode 100644 index 6093ec75037b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_56de9a7dfb1201b56528740e9d8a07b62710fcaf.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_56ffe9e21362afe9c3a407c09d5de186954931a6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_56ffe9e21362afe9c3a407c09d5de186954931a6.hip deleted file mode 100644 index 4d99dfe2c629..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_56ffe9e21362afe9c3a407c09d5de186954931a6.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5724d91c1fd6290a6cf8d52a3801ac6b921dc7d4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5724d91c1fd6290a6cf8d52a3801ac6b921dc7d4.hip deleted file mode 100644 index b5c6505d18ea..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5724d91c1fd6290a6cf8d52a3801ac6b921dc7d4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_572e68bd619e118292768f0925ccf92cbfa68415.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_572e68bd619e118292768f0925ccf92cbfa68415.hip deleted file mode 100644 index 4499ce4ba7ca..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_572e68bd619e118292768f0925ccf92cbfa68415.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5732094f5917e9164ee0f973ac6ec47245a69101.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5732094f5917e9164ee0f973ac6ec47245a69101.hip deleted file mode 100644 index c97cb72598e7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5732094f5917e9164ee0f973ac6ec47245a69101.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5789f267d34c9961ced63ad07ffea2c6d2911415.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5789f267d34c9961ced63ad07ffea2c6d2911415.hip deleted file mode 100644 index 641ea7b50b61..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5789f267d34c9961ced63ad07ffea2c6d2911415.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr 
ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5854f09511778dd1779a839b0b194896070f69ad.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5854f09511778dd1779a839b0b194896070f69ad.hip deleted file mode 100644 index f7f7a1237ccb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5854f09511778dd1779a839b0b194896070f69ad.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - 
std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58679919fcd292a2a69543de0db94e2985c9d364.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58679919fcd292a2a69543de0db94e2985c9d364.hip deleted file mode 100644 index 2a287879a957..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58679919fcd292a2a69543de0db94e2985c9d364.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = 
k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58762476c7f2bb05dce92ec22c0acbeb03676746.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58762476c7f2bb05dce92ec22c0acbeb03676746.hip deleted file mode 100644 index e61443fb63ef..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58762476c7f2bb05dce92ec22c0acbeb03676746.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr 
ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_587fc33d02b1932235b8d152e57559060211d591.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_587fc33d02b1932235b8d152e57559060211d591.hip deleted file mode 100644 index 03286eeef005..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_587fc33d02b1932235b8d152e57559060211d591.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = 
fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58a784fb478ff5b3f1e2da9765a3a777efda92e3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58a784fb478ff5b3f1e2da9765a3a777efda92e3.hip deleted file mode 100644 index 5a74f2a4e48b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58a784fb478ff5b3f1e2da9765a3a777efda92e3.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58a7ab44bbd9fbc97c7805860d5f6ac81d6ae468.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58a7ab44bbd9fbc97c7805860d5f6ac81d6ae468.hip deleted file mode 100644 index 98811637c874..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58a7ab44bbd9fbc97c7805860d5f6ac81d6ae468.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 128, - true, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58eb2edc7738d8d18ac359691da261ceaaf71788.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58eb2edc7738d8d18ac359691da261ceaaf71788.hip deleted file mode 100644 index 8b747833a795..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_58eb2edc7738d8d18ac359691da261ceaaf71788.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5919133d2ed892745013b2fc5d503414cf0a4d83.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5919133d2ed892745013b2fc5d503414cf0a4d83.hip deleted file mode 100644 index e6e48e42ab67..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5919133d2ed892745013b2fc5d503414cf0a4d83.hip +++ /dev/null @@ -1,14395 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -#include - -template -float fmha_bwd_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - if(s.log_level_ > 0) - std::cout << ", " << fmha_bwd_dot_do_o_get_name_() << ", " << fmha_bwd_dq_dk_dv_get_name_() << ", " << fmha_bwd_convert_dq_get_name_() << std::flush; - return ck_tile::launch_kernel(s, - [=](const ck_tile::stream_config& s_){ fmha_bwd_dot_do_o_oneshot_(s_, a); }, - [=](const ck_tile::stream_config& s_){ fmha_bwd_dq_dk_dv_oneshot_(s_, a); }, - [=](const ck_tile::stream_config& s_){ fmha_bwd_convert_dq_oneshot_(s_, a); } - ); -} - -float fmha_bwd(fmha_bwd_traits t, fmha_bwd_args a, const ck_tile::stream_config& s){ - float r = -1; - if(t.data_type.compare("fp16") == 0){ - if (t.hdim_q <= 32 && t.hdim_v <= 32) { - if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = 
fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using 
convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == 
bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) 
{ - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = 
fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && 
(t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == 
false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && 
(a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = 
fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = 
fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else 
if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && 
(t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = 
fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && 
(t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && 
(t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = 
fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, 
false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, 
false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) 
&& (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && 
(t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, 
false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) 
&& (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = 
fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using 
convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; 
- } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == 
bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && 
t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 
0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ 
= fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = 
fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && 
(t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = 
fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, 
ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = 
fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && 
t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - 
using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = 
fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, 
true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, 
ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = 
fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - 
using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && 
(t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, 
true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - 
(true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == 
false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - - } - else if (t.hdim_q <= 64 && t.hdim_v <= 64) { - if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, 
ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, 
false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type 
== mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && 
(a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, 
ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, 
true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - 
return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) 
&& (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 
0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && 
(t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, 
false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - 
else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 
0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = 
fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == 
bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 
[generated fmha_bwd dispatch hunk, fp16 / hdim 64: repeated "else if" branches over group mode, mask type (no_mask / masked), bias type (no_bias / alibi), dbias, dropout with store_randval, seqlen_q / seqlen_k / hdim_q / hdim_v divisibility, and determinism — each branch selecting fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, ...>, fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, ..., BlockFmhaBwdPipelineEnum::KRKTRVR or KRKTRVR_IGLP, ...>, and fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, ...> instantiations and calling fmha_bwd_(s, a)]
ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, false, false>; - 
r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == 
false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) 
&& (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, 
false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, 
false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, true, true>; - r = 
fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, 
ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, 
ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { 
- using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = 
fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && 
(a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, 
(Deleted generated dispatch code, continued: the remaining fp16 group-mode branches with ALiBi bias and dropout for hdim_q/hdim_v <= 64, followed by the batch-mode branches for hdim_q/hdim_v <= 128 with no_mask or ALiBi bias, with and without dropout. Each removed branch tests is_group_mode, mask_type, bias_type, has_dbias, has_dropout/is_store_randval, the seqlen_q/seqlen_k/hdim padding conditions, and is_deterministic, then instantiates the matching fmha_bwd_dot_do_o_traits_, fmha_bwd_dq_dk_dv_traits_ (KRKTRVR or KRKTRVR_IGLP pipeline), and fmha_bwd_convert_dq_traits_ types and calls fmha_bwd_ with them.)
if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && 
(t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == 
false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 
0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, 
ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - 
else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k 
% 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, 
true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, 
false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != 
mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true 
&& t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 
!= 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 
!= 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = 
fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, 
a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && 
(a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - 
using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, false>; - r = 
fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && 
(t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 
128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using 
dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, 
true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, 
true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - 
(true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == 
mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = 
fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, 
ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = 
fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - - } - else if (t.hdim_q <= 256 && t.hdim_v <= 256) { - if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - 
return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 
0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = 
fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); 
- return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && 
t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && 
(t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, 
false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, 
false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, 
true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && 
(a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = 
fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, 
false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && 
(t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) 
&& (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, 
false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, 
false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = 
fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; 
- } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != 
mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using 
dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, 
ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == 
bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == 
true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = 
fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, 
ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - 
using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, 
false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else 
if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && 
(t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && 
(a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = 
fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == 
bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using 
dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, 
false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = 
fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && 
(t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == 
true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 
0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, 
true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, 
true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic 
== false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, 
true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && 
(a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == 
bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, false>; - using dq_dk_dv_trait_ = 
fmha_bwd_dq_dk_dv_traits_<256, ck_tile::fp16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::fp16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - - } - - } - else if(t.data_type.compare("bf16") == 0){ - if (t.hdim_q <= 32 && t.hdim_v <= 32) { - if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && 
(t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && 
(a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = 
fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, 
false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && 
(t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == 
false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && 
(t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, 
false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, 
ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = 
fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - 
(a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using 
dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, 
false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == 
mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && 
(t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = 
fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = 
fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, 
a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == 
bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 
32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, 
ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = 
fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) 
&& (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using 
dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = 
fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && 
(t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && 
t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 
128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - 
using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - 
using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, 
true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && 
(t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, 
ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, 
ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - 
} - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - 
(a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = 
fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, 
false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using 
convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - 
return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) 
&& (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, 
ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && 
(t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != 
mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, 
false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else 
if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 != 0) && (a.hdim_v % 32 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 32 == 0) && (a.hdim_v % 32 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<32, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<32, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - - } - else if (t.hdim_q <= 64 && t.hdim_v <= 64) { - if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && 
(t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, 
true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - 
(a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, 
false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, 
false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == 
bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 
and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) 
&& (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, 
ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, 
ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = 
fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - 
(a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using 
dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else 
if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 
0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, 
ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using 
convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - 
return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) 
&& (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using 
dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, 
true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != 
mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) 
&& (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, 
ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, 
ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode 
== false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && 
(t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 
0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && 
(t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - 
} - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && 
(a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = 
fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, 
true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != 
mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && 
(a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = 
fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, 
ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, 
false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 32 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && 
(t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = 
fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else 
if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, 
ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, false, false>; - r = 
fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - 
using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = 
fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 != 0) && (a.hdim_v % 64 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 64 == 0) && (a.hdim_v % 64 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<64, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<64, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - - } - else if (t.hdim_q <= 128 && t.hdim_v <= 128) { - if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && 
(t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, 
true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, false, true>; - r = 
fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == 
false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ 
= fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using 
[Deleted auto-generated dispatcher branches elided here. This part of the removed generated file is the bf16 / hdim-128 fmha_bwd dispatch table: each deleted `else if` branch matches one combination of group mode, mask type (no_mask vs. masked), bias type (no_bias vs. alibi), dropout (with is_store_randval == false), seqlen_q / seqlen_k / hdim_q / hdim_v padding, and is_deterministic, then aliases the matching fmha_bwd_dot_do_o_traits_, fmha_bwd_dq_dk_dv_traits_ (KRKTRVR or KRKTRVR_IGLP pipeline), and fmha_bwd_convert_dq_traits_ instantiations and returns fmha_bwd_(s, a).]
false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, 
false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) 
&& (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == 
bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && 
t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && 
(a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, 
ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = 
fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && 
(t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && 
(t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - 
using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else 
if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == 
false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && 
(a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 != 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && 
(t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, 
ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 128 == 0) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && 
(t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return 
r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, false>; - 
using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using 
convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && 
(t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 != 0) && (a.hdim_v % 128 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 128 == 0) && (a.hdim_v % 128 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<128, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<128, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - - } - else if (t.hdim_q <= 256 && t.hdim_v <= 256) { - if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, 
ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && 
(t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = 
fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, 
ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = 
fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && 
(t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && 
t.is_store_randval == false) &&
- (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) {
- using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>;
- using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>;
- using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>;
- r = fmha_bwd_(s, a);
- return r;
- }
[Several dozen further generated else-if branches removed by this hunk follow the same pattern: each matches one combination of (t.mask_type == or != mask_enum::no_mask), (t.bias_type == bias_enum::no_bias or bias_enum::alibi), dropout on or off (always with t.is_store_randval == false), the padding checks a.seqlen_q % 16 / % 64, a.seqlen_k % 64, a.hdim_q % 256, a.hdim_v % 256, and t.is_deterministic true/false, all with t.is_group_mode == false and t.has_dbias == false; each instantiates the matching fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, ...>, fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, KRKTRVR or KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, NO_BIAS or ALIBI, ...>, and fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, ...> aliases, then calls r = fmha_bwd_(s, a) and returns r.]
- else if((t.is_group_mode == false) && (t.mask_type !=
mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && 
(a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, 
true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, 
ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, 
false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = 
fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, 
false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 
0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, 
ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using 
convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != 
mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 
== 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, 
ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = 
fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return 
r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 != 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && 
(t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && 
t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 16 == 0 and a.seqlen_q % 64 != 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == false) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (a.seqlen_q % 64 == 0) && (a.seqlen_k % 64 == 0) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, false, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, false, false, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, 
ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = 
fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) 
&& (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, 
ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type == mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout 
== false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::no_bias) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, 
true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != 
mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, true, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 != 0) && (a.hdim_v % 256 != 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, true>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, true, true, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, true, false>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == true)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, true>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, false, true>; - r = fmha_bwd_(s, a); - return r; - } - else if((t.is_group_mode == true) && (t.mask_type != mask_enum::no_mask) && (t.bias_type == bias_enum::alibi) && (t.has_dbias == false) && (t.has_dropout == true && t.is_store_randval == false) && - (true) && (true) && (a.hdim_q % 256 == 0) && (a.hdim_v % 256 == 0) && (t.is_deterministic == false)) { - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, true, true, false>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<256, ck_tile::bf16_t, true, ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, ck_tile::SimplifiedGenericAttentionMask, ck_tile::BlockDropoutBwd, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, true, false, false, false>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<256, ck_tile::bf16_t, true, true, false, false>; - r = fmha_bwd_(s, a); - return r; - } - - } - - } - - return r; -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5939e6610e41aff8d1ccdb66d9e84d3e48e8d379.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5939e6610e41aff8d1ccdb66d9e84d3e48e8d379.hip deleted file mode 100644 index 73d873d839fc..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5939e6610e41aff8d1ccdb66d9e84d3e48e8d379.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - 
constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_594929c433b049a8cf949ff476309a8faf5c25fb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_594929c433b049a8cf949ff476309a8faf5c25fb.hip deleted file mode 100644 index f16363e30228..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_594929c433b049a8cf949ff476309a8faf5c25fb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - 
std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_597a0276ec419f18f060a5186e6bb703ae434ac8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_597a0276ec419f18f060a5186e6bb703ae434ac8.hip deleted file mode 100644 index 86a32dbb5a02..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_597a0276ec419f18f060a5186e6bb703ae434ac8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_59901147b7188212b8d8feea15831a11425fe4b3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_59901147b7188212b8d8feea15831a11425fe4b3.hip deleted file mode 100644 index 6775ca1db228..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_59901147b7188212b8d8feea15831a11425fe4b3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_59beb9cb4e161f9dcff79080149076488d436301.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_59beb9cb4e161f9dcff79080149076488d436301.hip deleted file mode 100644 index 182e1c39d90e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_59beb9cb4e161f9dcff79080149076488d436301.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_59d366421e0b51c90fa53c366d47ed8d51b3a329.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_59d366421e0b51c90fa53c366d47ed8d51b3a329.hip deleted file mode 100644 index e26d9f840768..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_59d366421e0b51c90fa53c366d47ed8d51b3a329.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; 
- -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5a05b4e7782bd0e29ca9f6d33fc59d4304136d41.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5a05b4e7782bd0e29ca9f6d33fc59d4304136d41.hip deleted file mode 100644 index 12d2e95d12f5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5a05b4e7782bd0e29ca9f6d33fc59d4304136d41.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5a216f777feec4752f5882677b18168225da4b53.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5a216f777feec4752f5882677b18168225da4b53.hip deleted file mode 100644 index ecf6c4e54e00..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5a216f777feec4752f5882677b18168225da4b53.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::bf16_t, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5a29b93cee012c79d4364502f1d90f947c73641d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5a29b93cee012c79d4364502f1d90f947c73641d.hip deleted file mode 100644 index b46adb31b3bd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5a29b93cee012c79d4364502f1d90f947c73641d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5a85ae0a16e4b293b549bcb6a3ee52df7fccca32.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5a85ae0a16e4b293b549bcb6a3ee52df7fccca32.hip deleted file mode 100644 index f8cb44c15e5f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5a85ae0a16e4b293b549bcb6a3ee52df7fccca32.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::bf16_t, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5aba1183efe205af38e79a1b2dccea5fa515d02e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5aba1183efe205af38e79a1b2dccea5fa515d02e.hip deleted file mode 100644 index b912bf7fa549..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5aba1183efe205af38e79a1b2dccea5fa515d02e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5ace1c9b00f160a17355d4583d49c47887ac33c8.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5ace1c9b00f160a17355d4583d49c47887ac33c8.hip deleted file mode 100644 index cfa07b8966f7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5ace1c9b00f160a17355d4583d49c47887ac33c8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5af96b404feac271dac8f4190180754480d3ba80.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5af96b404feac271dac8f4190180754480d3ba80.hip deleted file mode 100644 index 379c0917decf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5af96b404feac271dac8f4190180754480d3ba80.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5b413bdc825ae863d53dab548f2145dc0de8fd37.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5b413bdc825ae863d53dab548f2145dc0de8fd37.hip deleted file mode 100644 index 3ce072783e28..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5b413bdc825ae863d53dab548f2145dc0de8fd37.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 128, - true, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, true, true, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5b55946ff3c15a44b9c741e9f6bbbcb5bd4c8577.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5b55946ff3c15a44b9c741e9f6bbbcb5bd4c8577.hip deleted file mode 100644 index 60b33e6de23e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5b55946ff3c15a44b9c741e9f6bbbcb5bd4c8577.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 
0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5b7a4ea3bb8905a22ae97a94c354b1cbe38093bb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5b7a4ea3bb8905a22ae97a94c354b1cbe38093bb.hip deleted file mode 100644 index 07aac70d3566..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5b7a4ea3bb8905a22ae97a94c354b1cbe38093bb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - 
fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5ba578c0e7abf1127dd0370f06d7278656c93ab9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5ba578c0e7abf1127dd0370f06d7278656c93ab9.hip deleted file mode 100644 index c69568dada24..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5ba578c0e7abf1127dd0370f06d7278656c93ab9.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 32, - true, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, true, true, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5bc803342862aa30e23e5be7d84e611bc571c529.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5bc803342862aa30e23e5be7d84e611bc571c529.hip deleted file mode 100644 index fc9ffd7c9b56..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5bc803342862aa30e23e5be7d84e611bc571c529.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5be9ed84ad9be1627db7a66af9370679816c0897.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5be9ed84ad9be1627db7a66af9370679816c0897.hip deleted file mode 100644 index ebf874826946..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5be9ed84ad9be1627db7a66af9370679816c0897.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5bead6be6e39ece0e5d44335083336f7f546d2f8.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5bead6be6e39ece0e5d44335083336f7f546d2f8.hip deleted file mode 100644 index 9ab599a95e0e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5bead6be6e39ece0e5d44335083336f7f546d2f8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5c36fc744dfb0d985c9113175e76c7ec1c935054.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5c36fc744dfb0d985c9113175e76c7ec1c935054.hip deleted file mode 100644 index 949ebb005486..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5c36fc744dfb0d985c9113175e76c7ec1c935054.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - 
false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5c742b9ac6749f189d597ac97d46d35189472c50.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5c742b9ac6749f189d597ac97d46d35189472c50.hip deleted file mode 100644 index 1ede6dd6ad3b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5c742b9ac6749f189d597ac97d46d35189472c50.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5cd03e29403ad53d6d52e5e81182ea6ff5aff2be.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5cd03e29403ad53d6d52e5e81182ea6ff5aff2be.hip deleted file mode 100644 index b2babf12db62..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5cd03e29403ad53d6d52e5e81182ea6ff5aff2be.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5cd41b6f578f3c903eb9d58ebfab62eb296044e0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5cd41b6f578f3c903eb9d58ebfab62eb296044e0.hip deleted file mode 100644 index 9ce2621ee477..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5cd41b6f578f3c903eb9d58ebfab62eb296044e0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5d707d065ae152450f9def619ddc3dddb9089e88.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5d707d065ae152450f9def619ddc3dddb9089e88.hip deleted file mode 100644 index 37d96cb19303..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5d707d065ae152450f9def619ddc3dddb9089e88.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 
0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5d7ed4c885fb32a0b548186e56d64bab98071d30.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5d7ed4c885fb32a0b548186e56d64bab98071d30.hip deleted file mode 100644 index 3dcfaca1089a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5d7ed4c885fb32a0b548186e56d64bab98071d30.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - 
fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5daedab8931f2eefb649b91e80145cb71b63360c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5daedab8931f2eefb649b91e80145cb71b63360c.hip deleted file mode 100644 index 264123892a53..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5daedab8931f2eefb649b91e80145cb71b63360c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5de27c4081377f59363c2bf2ea8624217566d2d3.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5de27c4081377f59363c2bf2ea8624217566d2d3.hip deleted file mode 100644 index 295219810867..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5de27c4081377f59363c2bf2ea8624217566d2d3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5e0abf4e2b6be3e2c555c2134705b9dcaee617ce.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5e0abf4e2b6be3e2c555c2134705b9dcaee617ce.hip deleted file mode 100644 index 3933d8ee07fa..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5e0abf4e2b6be3e2c555c2134705b9dcaee617ce.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; 
- -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5e62968de58d9df7d687d671f37d63393f189321.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5e62968de58d9df7d687d671f37d63393f189321.hip deleted file mode 100644 index 1b6a40496223..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5e62968de58d9df7d687d671f37d63393f189321.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5e735b12d130ebf849ac5d6752e413ecf3e69fbf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5e735b12d130ebf849ac5d6752e413ecf3e69fbf.hip deleted file mode 100644 index 1579ca0b58a4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5e735b12d130ebf849ac5d6752e413ecf3e69fbf.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5e840be0741afa4d41fd4789c8300223fdc63ddc.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5e840be0741afa4d41fd4789c8300223fdc63ddc.hip deleted file mode 100644 index 9b9d4796ee6a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5e840be0741afa4d41fd4789c8300223fdc63ddc.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 64, - true, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, true, true, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5ea53f7c6370845fa94aa9b395c52fd1900b62de.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5ea53f7c6370845fa94aa9b395c52fd1900b62de.hip deleted file mode 100644 index 8bf88c68fbf9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5ea53f7c6370845fa94aa9b395c52fd1900b62de.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5efe77ca5c394a60af0313072cdd132216a52bf3.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5efe77ca5c394a60af0313072cdd132216a52bf3.hip deleted file mode 100644 index 965e6030c662..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5efe77ca5c394a60af0313072cdd132216a52bf3.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f20263fd84776f155519b3481be5e2c5b035585.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f20263fd84776f155519b3481be5e2c5b035585.hip deleted file mode 100644 index 37fca282cd7d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f20263fd84776f155519b3481be5e2c5b035585.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::fp16_t, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f3c3bed2b584ea2031debf9f953f5f8f7012171.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f3c3bed2b584ea2031debf9f953f5f8f7012171.hip deleted file mode 100644 index 55ff1b31f449..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f3c3bed2b584ea2031debf9f953f5f8f7012171.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f71e663978dbcba859c5114ec675a712e343fd6.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f71e663978dbcba859c5114ec675a712e343fd6.hip deleted file mode 100644 index c7b60899dd41..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f71e663978dbcba859c5114ec675a712e343fd6.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::fp16_t, - false, - false, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f8925f929a5b26f3544ca31938aa75b3c59d34d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f8925f929a5b26f3544ca31938aa75b3c59d34d.hip deleted file mode 100644 index 9be7a392e356..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f8925f929a5b26f3544ca31938aa75b3c59d34d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f954a393b7b5a7131c13d0c4578443f468a738d.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f954a393b7b5a7131c13d0c4578443f468a738d.hip deleted file mode 100644 index 8ea46759a796..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5f954a393b7b5a7131c13d0c4578443f468a738d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5fa19223cf296d7fd10e15e2571e63c84a80fbb1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5fa19223cf296d7fd10e15e2571e63c84a80fbb1.hip deleted file mode 100644 index cd1bd1a76308..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5fa19223cf296d7fd10e15e2571e63c84a80fbb1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, 
- false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5fa7fafd4227918e0c7f0c6ca3b2bd673cd07279.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5fa7fafd4227918e0c7f0c6ca3b2bd673cd07279.hip deleted file mode 100644 index 506f74bc003e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5fa7fafd4227918e0c7f0c6ca3b2bd673cd07279.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5fb062527121e627871b3f1b2a94b96c42e51205.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5fb062527121e627871b3f1b2a94b96c42e51205.hip deleted file mode 100644 index c69d06667853..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5fb062527121e627871b3f1b2a94b96c42e51205.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5fc66c5b53f83bf1e023e81e9d51f0285b3ae731.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5fc66c5b53f83bf1e023e81e9d51f0285b3ae731.hip deleted file mode 100644 index 8d679b5a7a1b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_5fc66c5b53f83bf1e023e81e9d51f0285b3ae731.hip +++ /dev/null @@ -1,138 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); 
-} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6018ab272d7306689c7dc5a6d5326efea1471235.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6018ab272d7306689c7dc5a6d5326efea1471235.hip deleted file mode 100644 index 07046a15a25e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6018ab272d7306689c7dc5a6d5326efea1471235.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = 
k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6049c01db99fce654e9351e711b113cf7424550a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6049c01db99fce654e9351e711b113cf7424550a.hip deleted file mode 100644 index 16ed12be5487..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6049c01db99fce654e9351e711b113cf7424550a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - 
-using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_606f5e0b99814b0a82a731de36f28024bc317801.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_606f5e0b99814b0a82a731de36f28024bc317801.hip deleted file mode 100644 index 67d6aa71259b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_606f5e0b99814b0a82a731de36f28024bc317801.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_60801d21c14796c08377349ec86a6c800af497b7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_60801d21c14796c08377349ec86a6c800af497b7.hip deleted file mode 100644 index dbe5a640a83a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_60801d21c14796c08377349ec86a6c800af497b7.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6082d55544b5280b49b071ea277fb1827193fa2a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6082d55544b5280b49b071ea277fb1827193fa2a.hip deleted file mode 100644 index 65998fe6898b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6082d55544b5280b49b071ea277fb1827193fa2a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_609616f72bf16a060fa50091ac139ddc06bf9d88.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_609616f72bf16a060fa50091ac139ddc06bf9d88.hip deleted file mode 100644 index 2f80789150a9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_609616f72bf16a060fa50091ac139ddc06bf9d88.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_609f68180582384ba81aae2b1d4a4c52dde2c68c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_609f68180582384ba81aae2b1d4a4c52dde2c68c.hip deleted file mode 100644 index bf022b31497e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_609f68180582384ba81aae2b1d4a4c52dde2c68c.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::bf16_t, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_60efa9c427dc278c0d1bc31189f683cd45e4d873.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_60efa9c427dc278c0d1bc31189f683cd45e4d873.hip deleted file mode 100644 index cf1c43eead35..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_60efa9c427dc278c0d1bc31189f683cd45e4d873.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61204f6805d5d830aa6fca2a9b5f238ed63c3a73.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61204f6805d5d830aa6fca2a9b5f238ed63c3a73.hip deleted file mode 100644 index 95260742fc3c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61204f6805d5d830aa6fca2a9b5f238ed63c3a73.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61220f6dca850a5b5ccf1f619a267c40c37efeca.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61220f6dca850a5b5ccf1f619a267c40c37efeca.hip deleted file mode 100644 index 7ba98f65bb63..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61220f6dca850a5b5ccf1f619a267c40c37efeca.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 64, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_614a9f10ebc51bde3f580ef527c17f89489c12c7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_614a9f10ebc51bde3f580ef527c17f89489c12c7.hip deleted file mode 100644 index 84b4dc89da19..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_614a9f10ebc51bde3f580ef527c17f89489c12c7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_615430cb65d8d540836c7f12b3367abd3c8e63d2.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_615430cb65d8d540836c7f12b3367abd3c8e63d2.hip deleted file mode 100644 index 3c15d4330431..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_615430cb65d8d540836c7f12b3367abd3c8e63d2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_618031345ea71cc17e458eb97a559b7c94d3ae43.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_618031345ea71cc17e458eb97a559b7c94d3ae43.hip deleted file mode 100644 index 3cb600ee4026..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_618031345ea71cc17e458eb97a559b7c94d3ae43.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - 
false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61896aa9e4e4d7e494c1755b1e77a08e0e264f8d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61896aa9e4e4d7e494c1755b1e77a08e0e264f8d.hip deleted file mode 100644 index 7bf0a0cee582..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61896aa9e4e4d7e494c1755b1e77a08e0e264f8d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61a44ac409e914c12281f1d26e5b52d8bfd0df75.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61a44ac409e914c12281f1d26e5b52d8bfd0df75.hip deleted file mode 100644 index 46d578606ff5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61a44ac409e914c12281f1d26e5b52d8bfd0df75.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61a9e92183ba87924e73ff0b5e25bd12d6038e69.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61a9e92183ba87924e73ff0b5e25bd12d6038e69.hip deleted file mode 100644 index b9599a8a5869..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_61a9e92183ba87924e73ff0b5e25bd12d6038e69.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_62048a8ae1c0096f3372b0114c15edbe813425fd.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_62048a8ae1c0096f3372b0114c15edbe813425fd.hip deleted file mode 100644 index 1eaa08a2a3e6..000000000000 
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_62048a8ae1c0096f3372b0114c15edbe813425fd.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr 
ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6214f820b39a8ba81e547a78ed19a909ac13221c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6214f820b39a8ba81e547a78ed19a909ac13221c.hip deleted file mode 100644 index 1b4197acb3ac..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6214f820b39a8ba81e547a78ed19a909ac13221c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << 
", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_621da34ee666903307d3a09b7a032f2a70054759.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_621da34ee666903307d3a09b7a032f2a70054759.hip deleted file mode 100644 index 9f641c25b044..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_621da34ee666903307d3a09b7a032f2a70054759.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, 
- typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_628b28f65f19e7d1b22fb3b85b7cf3d09cd54ebc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_628b28f65f19e7d1b22fb3b85b7cf3d09cd54ebc.hip deleted file mode 100644 index fd55dcf45642..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_628b28f65f19e7d1b22fb3b85b7cf3d09cd54ebc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_629e0b97b3fece7c12504f4c8f1860d611b57269.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_629e0b97b3fece7c12504f4c8f1860d611b57269.hip deleted file mode 100644 index 242df26fd3c7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_629e0b97b3fece7c12504f4c8f1860d611b57269.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_62ab710e4acc711430745e05e036dd6a4d6bcdca.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_62ab710e4acc711430745e05e036dd6a4d6bcdca.hip deleted file mode 100644 index 05bdb5633063..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_62ab710e4acc711430745e05e036dd6a4d6bcdca.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_62ba7a5a0f3a714eb5f9f2af20f7bfbc82a30350.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_62ba7a5a0f3a714eb5f9f2af20f7bfbc82a30350.hip deleted file mode 100644 index 615213a2ee39..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_62ba7a5a0f3a714eb5f9f2af20f7bfbc82a30350.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_62eb2f81e73d65fddce7ff43c397da6529317607.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_62eb2f81e73d65fddce7ff43c397da6529317607.hip deleted file mode 100644 index e3790b15bb31..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_62eb2f81e73d65fddce7ff43c397da6529317607.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_634d530731c7ade2c7beecfd1bbbca8583032217.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_634d530731c7ade2c7beecfd1bbbca8583032217.hip deleted file mode 100644 index 94a5af87cd13..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_634d530731c7ade2c7beecfd1bbbca8583032217.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6360621af3f7e1e81a8be48fea8d2750fdecbbf4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6360621af3f7e1e81a8be48fea8d2750fdecbbf4.hip deleted file mode 100644 index c0f0585f0085..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6360621af3f7e1e81a8be48fea8d2750fdecbbf4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6376eb68c550b50b9aea42a7a2cc3bda186b0e40.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6376eb68c550b50b9aea42a7a2cc3bda186b0e40.hip deleted file mode 100644 index c01596dd82b5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6376eb68c550b50b9aea42a7a2cc3bda186b0e40.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_63c411351ec59bdbed2590c599f9eddf7807b371.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_63c411351ec59bdbed2590c599f9eddf7807b371.hip deleted file mode 100644 index 962d0ef7dd57..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_63c411351ec59bdbed2590c599f9eddf7807b371.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_63f121a3c8928c10a2d86b487cd13fa995da670d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_63f121a3c8928c10a2d86b487cd13fa995da670d.hip deleted file mode 100644 index 4072a55be331..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_63f121a3c8928c10a2d86b487cd13fa995da670d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_643b3798f11997d33ccb58d90ed6c10d5411b735.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_643b3798f11997d33ccb58d90ed6c10d5411b735.hip deleted file mode 100644 index 7723199302e7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_643b3798f11997d33ccb58d90ed6c10d5411b735.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_649336d59a8b35919e593217b6fd4314a04ea359.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_649336d59a8b35919e593217b6fd4314a04ea359.hip deleted file mode 100644 index 36e3e7e23a89..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_649336d59a8b35919e593217b6fd4314a04ea359.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64a0ca185449a49fa485892fde6af745ba758167.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64a0ca185449a49fa485892fde6af745ba758167.hip deleted file mode 100644 index 252b2a476d3a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64a0ca185449a49fa485892fde6af745ba758167.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64b3488ddf3bb1a4870371882f0a5d267bdfdf73.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64b3488ddf3bb1a4870371882f0a5d267bdfdf73.hip deleted file mode 100644 index a85575d7ed58..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64b3488ddf3bb1a4870371882f0a5d267bdfdf73.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64c3c1e3dac623f07c2dc1b934ccb868cafcb38c.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64c3c1e3dac623f07c2dc1b934ccb868cafcb38c.hip deleted file mode 100644 index 35f9729777d9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64c3c1e3dac623f07c2dc1b934ccb868cafcb38c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64cf03c0aa3f1b2a7b76b4e3418eb5063b982a29.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64cf03c0aa3f1b2a7b76b4e3418eb5063b982a29.hip deleted file mode 100644 index 0c04e1c6d854..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64cf03c0aa3f1b2a7b76b4e3418eb5063b982a29.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, 
- false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64fe2db75cb20428856b02cd1cc8d7b393a6ad9c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64fe2db75cb20428856b02cd1cc8d7b393a6ad9c.hip deleted file mode 100644 index ffc7421e3c22..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_64fe2db75cb20428856b02cd1cc8d7b393a6ad9c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_65794d9c185b21f59274ac5d4db10a7abc0be968.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_65794d9c185b21f59274ac5d4db10a7abc0be968.hip deleted file mode 100644 index 81ca0165e7d8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_65794d9c185b21f59274ac5d4db10a7abc0be968.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_658552954505a2092662071401e135e84956c4c0.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_658552954505a2092662071401e135e84956c4c0.hip deleted file mode 100644 index b02514fd7dd1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_658552954505a2092662071401e135e84956c4c0.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 128, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, false, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_65910c8b7a30acc731948ab58467fdbe4fe32f6d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_65910c8b7a30acc731948ab58467fdbe4fe32f6d.hip deleted file mode 100644 index 7fcb1c9f007f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_65910c8b7a30acc731948ab58467fdbe4fe32f6d.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_661b49505cfecbe4ec3e5c7371de3aaaa85ac9d5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_661b49505cfecbe4ec3e5c7371de3aaaa85ac9d5.hip deleted file mode 100644 index 74a4bb85dad7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_661b49505cfecbe4ec3e5c7371de3aaaa85ac9d5.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_661ffaf653085dd7f122d603bb3ba4b001e5f3c0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_661ffaf653085dd7f122d603bb3ba4b001e5f3c0.hip deleted file mode 100644 index 13028377a8f0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_661ffaf653085dd7f122d603bb3ba4b001e5f3c0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_662767e588220d0dc6137b00cc1d8dcc91e97134.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_662767e588220d0dc6137b00cc1d8dcc91e97134.hip deleted file mode 100644 index 0493f71435c0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_662767e588220d0dc6137b00cc1d8dcc91e97134.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6649f19deeaea20663bee781af7edced7f7a4fc0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6649f19deeaea20663bee781af7edced7f7a4fc0.hip deleted file mode 100644 index cc15c8652462..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6649f19deeaea20663bee781af7edced7f7a4fc0.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_66968bbf7e210911fcb95ba90c79837230ab1ce3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_66968bbf7e210911fcb95ba90c79837230ab1ce3.hip deleted file mode 100644 index 38bd1c2abfa2..000000000000 
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_66968bbf7e210911fcb95ba90c79837230ab1ce3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr 
ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_66a020f728df204ff51e37d2ddc21afb0aad5e7b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_66a020f728df204ff51e37d2ddc21afb0aad5e7b.hip deleted file mode 100644 index 59293e5666f9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_66a020f728df204ff51e37d2ddc21afb0aad5e7b.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_66be70b088b20fc8de464167c35745461ddab640.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_66be70b088b20fc8de464167c35745461ddab640.hip deleted file mode 100644 index 9040582b39f9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_66be70b088b20fc8de464167c35745461ddab640.hip +++ /dev/null @@ -1,138 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - 
-template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_66f651d3415562206c1049b172261fddba01ea6c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_66f651d3415562206c1049b172261fddba01ea6c.hip deleted file mode 100644 index 6426bba66834..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_66f651d3415562206c1049b172261fddba01ea6c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - 
constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
- return ck_tile::launch_kernel(
- s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs));
-}
-
-template <>
-void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s,
- fmha_bwd_args a)
-{
- using k_ = fmha_bwd_dq_dk_dv_kernel_0;
- auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a);
- constexpr dim3 blocks = k_::BlockSize();
- constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
- ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)(
- ck_tile::stream_config{s.stream_id_});
-}
-
-template <>
-std::string fmha_bwd_dq_dk_dv_get_name_()
-{
- using k_ = fmha_bwd_dq_dk_dv_kernel_0;
- return k_::GetName();
-}
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_671828f15eec2a58be23063a1a8132d337cd26de.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_671828f15eec2a58be23063a1a8132d337cd26de.hip
deleted file mode 100644
index 6ef9fdb78fb2..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_671828f15eec2a58be23063a1a8132d337cd26de.hip
+++ /dev/null
@@ -1,73 +0,0 @@
-// ==========================================
-// THIS CODE IS AUTOGENERATED. DO NOT MODIFY.
-// @generated
-// ==========================================
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-// auto generated by generate.py
-#include
-
-using fmha_dtype_0 = ck_tile::fp16_t;
-
-using fmha_bwd_convert_dq_trait_0 =
- ck_tile::TileFmhaBwdConvertQGradTraits;
-
-using fmha_bwd_convert_dq_pipeline_problem_0 =
- ck_tile::BlockFmhaBwdConvertQGradPipelineProblem<
- typename FmhaBwdTypeConfig::AccDataType,
- typename FmhaBwdTypeConfig::QGradDataType,
- /* BlockSize = */ 256,
- 64,
- 64,
- 256,
- true,
- true,
- fmha_bwd_convert_dq_trait_0>;
-
-using fmha_bwd_convert_dq_0 =
- typename ck_tile::BlockFmhaBwdConvertQGrad;
-
-using fmha_bwd_convert_dq_kernel_0 =
- ck_tile::FmhaBwdConvertQGradKernel;
-
-using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256,
- ck_tile::fp16_t,
- true,
- true,
- false,
- true>;
-
-#include
-
-template <>
-float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a)
-{
- using k_ = fmha_bwd_convert_dq_kernel_0;
- if(s.log_level_ > 0)
- std::cout << ", " << k_::GetName() << std::flush;
- auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a);
- constexpr dim3 blocks = k_::BlockSize();
- constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
- return ck_tile::launch_kernel(
- s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs));
-}
-
-template <>
-void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s,
- fmha_bwd_args a)
-{
- using k_ = fmha_bwd_convert_dq_kernel_0;
- auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a);
- constexpr dim3 blocks = k_::BlockSize();
- constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
- ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)(
- ck_tile::stream_config{s.stream_id_});
-}
-
-template <>
-std::string fmha_bwd_convert_dq_get_name_()
-{
- using k_ = fmha_bwd_convert_dq_kernel_0;
- return k_::GetName();
-}
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6767cce35ab784aa42ebcb75af7305bc38a8721a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6767cce35ab784aa42ebcb75af7305bc38a8721a.hip
deleted file mode 100644
index 7931294f2ed5..000000000000
---
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6767cce35ab784aa42ebcb75af7305bc38a8721a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr 
ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6785dcec0197fdbb50124ab06efa627f1a2c0567.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6785dcec0197fdbb50124ab06efa627f1a2c0567.hip deleted file mode 100644 index 502e487f780d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6785dcec0197fdbb50124ab06efa627f1a2c0567.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_678a4a8210a972bb2ed89d6ac754fb79438ab2da.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_678a4a8210a972bb2ed89d6ac754fb79438ab2da.hip deleted file mode 100644 index 85a609d3e40c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_678a4a8210a972bb2ed89d6ac754fb79438ab2da.hip +++ /dev/null @@ -1,138 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} 
- -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_67fb736c61088b8dd92fe0371f5c98e23bf9077f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_67fb736c61088b8dd92fe0371f5c98e23bf9077f.hip deleted file mode 100644 index 2bd231998817..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_67fb736c61088b8dd92fe0371f5c98e23bf9077f.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_680e81c3700f130df142c9a37a368944ca548721.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_680e81c3700f130df142c9a37a368944ca548721.hip deleted file mode 100644 index 675513aee0b6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_680e81c3700f130df142c9a37a368944ca548721.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_683e8a33fdb7053760c9c135002b0a94facbe015.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_683e8a33fdb7053760c9c135002b0a94facbe015.hip deleted file mode 100644 index 08a203a63e56..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_683e8a33fdb7053760c9c135002b0a94facbe015.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return 
ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_687f4aaafd1a5b9ee85aadc6fab79ad0c27a2ea2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_687f4aaafd1a5b9ee85aadc6fab79ad0c27a2ea2.hip deleted file mode 100644 index c14f7ad41ed1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_687f4aaafd1a5b9ee85aadc6fab79ad0c27a2ea2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - 
true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_688aaa193f332ed13e017e78ec07a7c80e45f6c5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_688aaa193f332ed13e017e78ec07a7c80e45f6c5.hip deleted file mode 100644 index 0bd9a1cc08de..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_688aaa193f332ed13e017e78ec07a7c80e45f6c5.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6905ba47078abd7a5b6a51eb93b26095517e7f70.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6905ba47078abd7a5b6a51eb93b26095517e7f70.hip deleted file mode 100644 index 204b0185fb12..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6905ba47078abd7a5b6a51eb93b26095517e7f70.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_69214eb450c3b249017480efb8d092b0edad6dc3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_69214eb450c3b249017480efb8d092b0edad6dc3.hip deleted file mode 100644 index 7c8b6afe1185..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_69214eb450c3b249017480efb8d092b0edad6dc3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6979ef43adffdb62100270a62706fb811963925a.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6979ef43adffdb62100270a62706fb811963925a.hip deleted file mode 100644 index df832120ac3b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6979ef43adffdb62100270a62706fb811963925a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_69cbe8eca7e3510f5caa7f13419cfbefbf031754.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_69cbe8eca7e3510f5caa7f13419cfbefbf031754.hip deleted file mode 100644 index 0ab886765465..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_69cbe8eca7e3510f5caa7f13419cfbefbf031754.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - 
false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a3f42d5c9ccdd3807e488b00f02bc6ab5d8d99a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a3f42d5c9ccdd3807e488b00f02bc6ab5d8d99a.hip deleted file mode 100644 index 510ebdb90739..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a3f42d5c9ccdd3807e488b00f02bc6ab5d8d99a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a4b6226b355bf35d4d07aaef1828091f03ad2ec.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a4b6226b355bf35d4d07aaef1828091f03ad2ec.hip deleted file mode 100644 index 9a3cd25c9722..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a4b6226b355bf35d4d07aaef1828091f03ad2ec.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a66604bb15f97a56847a7c968dbe32d247cbc13.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a66604bb15f97a56847a7c968dbe32d247cbc13.hip deleted file mode 100644 index 29e2aae4eddb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a66604bb15f97a56847a7c968dbe32d247cbc13.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a7b6781ffff9a42beebb4d73f0d15461ddd4479.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a7b6781ffff9a42beebb4d73f0d15461ddd4479.hip deleted file mode 100644 index e8fe9491d28b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a7b6781ffff9a42beebb4d73f0d15461ddd4479.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, 
blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a7eb3d86aa385f9ecffbc5ba10489e56856f918.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a7eb3d86aa385f9ecffbc5ba10489e56856f918.hip deleted file mode 100644 index 9859dbe06e07..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a7eb3d86aa385f9ecffbc5ba10489e56856f918.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - 
fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a95543aeed81adfb6d847f78212585a36122ae3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a95543aeed81adfb6d847f78212585a36122ae3.hip deleted file mode 100644 index 81ecc3b18d9e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6a95543aeed81adfb6d847f78212585a36122ae3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6abeb7b50ae6a1fc62535b9a1dabbde6f177a9d0.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6abeb7b50ae6a1fc62535b9a1dabbde6f177a9d0.hip deleted file mode 100644 index 769a228a0473..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6abeb7b50ae6a1fc62535b9a1dabbde6f177a9d0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6af23d1460abfe875e71f7911697c42fef0f41c5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6af23d1460abfe875e71f7911697c42fef0f41c5.hip deleted file mode 100644 index fa9915dd44a8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6af23d1460abfe875e71f7911697c42fef0f41c5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, 
- false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6af4c15a119e805e4407b184625f57966f8833d9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6af4c15a119e805e4407b184625f57966f8833d9.hip deleted file mode 100644 index 14534470a323..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6af4c15a119e805e4407b184625f57966f8833d9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6b0ef67ce0f178aa2863c4909f5bdd7f766c9b2f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6b0ef67ce0f178aa2863c4909f5bdd7f766c9b2f.hip deleted file mode 100644 index 569684c72536..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6b0ef67ce0f178aa2863c4909f5bdd7f766c9b2f.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6b638314efcc4f16aa4a6e58e6caf2fda1711519.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6b638314efcc4f16aa4a6e58e6caf2fda1711519.hip deleted file mode 100644 index 750be296fb5a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6b638314efcc4f16aa4a6e58e6caf2fda1711519.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6bad2ed9f91bc1efd89ea66cd5c775fa140cf931.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6bad2ed9f91bc1efd89ea66cd5c775fa140cf931.hip deleted file mode 100644 index ad3022d6b9a3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6bad2ed9f91bc1efd89ea66cd5c775fa140cf931.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6cfb7075345704340ff33dc0ef7c04ef127f26ad.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6cfb7075345704340ff33dc0ef7c04ef127f26ad.hip deleted file mode 100644 index 05e91216be1a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6cfb7075345704340ff33dc0ef7c04ef127f26ad.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::bf16_t, - false, - true, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d07bf9c05e41dcf2416e05dab4bdde17158db76.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d07bf9c05e41dcf2416e05dab4bdde17158db76.hip deleted file mode 100644 index c4ab4808c8e0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d07bf9c05e41dcf2416e05dab4bdde17158db76.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d17b92fab5bee7717bf9aff6a6bef7cee3816e7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d17b92fab5bee7717bf9aff6a6bef7cee3816e7.hip deleted file mode 100644 index b82ff8ab4983..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d17b92fab5bee7717bf9aff6a6bef7cee3816e7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return 
ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d307974bdeeef95cca0d130ebb7aeb77fb1b6eb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d307974bdeeef95cca0d130ebb7aeb77fb1b6eb.hip deleted file mode 100644 index 188d6d47f9e6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d307974bdeeef95cca0d130ebb7aeb77fb1b6eb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - 
false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d40d762ed576832b3a752453e9881b5fe6d2650.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d40d762ed576832b3a752453e9881b5fe6d2650.hip deleted file mode 100644 index 2ddbefa0efe9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d40d762ed576832b3a752453e9881b5fe6d2650.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d470f5c6fb81032fcd7974180297d4bb2a8427d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d470f5c6fb81032fcd7974180297d4bb2a8427d.hip deleted file mode 100644 index 16d9abd5d89d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d470f5c6fb81032fcd7974180297d4bb2a8427d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, 
blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d5aad18f59e47a3fa3278c7ef1a6372830c33d5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d5aad18f59e47a3fa3278c7ef1a6372830c33d5.hip deleted file mode 100644 index c56eecc5a3f2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6d5aad18f59e47a3fa3278c7ef1a6372830c33d5.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6db86621d626722434f2ae9b7b8ab435a8dd8827.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6db86621d626722434f2ae9b7b8ab435a8dd8827.hip deleted file mode 100644 index c5d84434223e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6db86621d626722434f2ae9b7b8ab435a8dd8827.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6dd707cf48a17d31abef94215c5720419faa0a39.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6dd707cf48a17d31abef94215c5720419faa0a39.hip deleted file mode 100644 index c609bd55973d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6dd707cf48a17d31abef94215c5720419faa0a39.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6e240106c771ebea461fc2a87b6da68e510aba70.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6e240106c771ebea461fc2a87b6da68e510aba70.hip deleted file mode 100644 index a28809448249..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6e240106c771ebea461fc2a87b6da68e510aba70.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6e6a4475ea795935f4cbf2dc0ac156a33d754587.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6e6a4475ea795935f4cbf2dc0ac156a33d754587.hip deleted file mode 100644 index 4ac93e1f60f2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6e6a4475ea795935f4cbf2dc0ac156a33d754587.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - 
std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6e7e1d245baabe2f6293e3d85318f9936b333500.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6e7e1d245baabe2f6293e3d85318f9936b333500.hip deleted file mode 100644 index 3948cba12a1f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6e7e1d245baabe2f6293e3d85318f9936b333500.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6e8cda718e10824956f0ee39bbb0891eafa45a7b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6e8cda718e10824956f0ee39bbb0891eafa45a7b.hip deleted file mode 100644 index 4c427cbe2f81..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6e8cda718e10824956f0ee39bbb0891eafa45a7b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6eca9cd905ea8b0454cf9564643894682b08cb97.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6eca9cd905ea8b0454cf9564643894682b08cb97.hip deleted file mode 100644 index b2e27eb1a999..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6eca9cd905ea8b0454cf9564643894682b08cb97.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, 
blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6eebd0c2fbfc85f938b10535855c388971129a28.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6eebd0c2fbfc85f938b10535855c388971129a28.hip deleted file mode 100644 index 76d52c506105..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6eebd0c2fbfc85f938b10535855c388971129a28.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 64, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, false, false, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6ef5803b33d97db72eb8a8528aeb3fc956a938cc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6ef5803b33d97db72eb8a8528aeb3fc956a938cc.hip deleted file mode 100644 index 0d2092d2a3a2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6ef5803b33d97db72eb8a8528aeb3fc956a938cc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. 
DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6f31b3345893eec8ed1ddf1d8de2512b46ff6187.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6f31b3345893eec8ed1ddf1d8de2512b46ff6187.hip deleted file mode 100644 index dce7738d2c57..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6f31b3345893eec8ed1ddf1d8de2512b46ff6187.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = 
k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6f3d098f8bb63133924aab70d26a6ed64018c13b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6f3d098f8bb63133924aab70d26a6ed64018c13b.hip deleted file mode 100644 index 07a4e63d703e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6f3d098f8bb63133924aab70d26a6ed64018c13b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - 
ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6f8788c537cbf6833c58a6ca15c0a36de33c9fbd.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6f8788c537cbf6833c58a6ca15c0a36de33c9fbd.hip deleted file mode 100644 index 3fe8cbefb451..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6f8788c537cbf6833c58a6ca15c0a36de33c9fbd.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6f88527a2cdb5adf51407f4661a254bb32d7de23.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6f88527a2cdb5adf51407f4661a254bb32d7de23.hip deleted file mode 100644 index 8fc6a93c12f1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6f88527a2cdb5adf51407f4661a254bb32d7de23.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6fa6478cc27e52fd9511fbff38369c921155cfb9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6fa6478cc27e52fd9511fbff38369c921155cfb9.hip deleted file mode 100644 index 26950a979caa..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6fa6478cc27e52fd9511fbff38369c921155cfb9.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6ff4605d82507fc4bd6e96095eaee5173ea41973.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6ff4605d82507fc4bd6e96095eaee5173ea41973.hip deleted file mode 100644 index 9bbb2c9af813..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6ff4605d82507fc4bd6e96095eaee5173ea41973.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6ff58a5186d69efd6062f3717bd315394ea6592b.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6ff58a5186d69efd6062f3717bd315394ea6592b.hip deleted file mode 100644 index 0c7e48f516d2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_6ff58a5186d69efd6062f3717bd315394ea6592b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_703246f1f53a988cf252eff88bdf814bd382d3ac.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_703246f1f53a988cf252eff88bdf814bd382d3ac.hip deleted file mode 100644 index f499998516fd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_703246f1f53a988cf252eff88bdf814bd382d3ac.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_70586668a61ab88bc46b763df8f1c2ea52001ea0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_70586668a61ab88bc46b763df8f1c2ea52001ea0.hip deleted file mode 100644 index 1503096dd83c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_70586668a61ab88bc46b763df8f1c2ea52001ea0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_70c8e45f6ea7cf5dba9eeadd0b19481d9f5defb7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_70c8e45f6ea7cf5dba9eeadd0b19481d9f5defb7.hip deleted file mode 100644 index 96230751279b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_70c8e45f6ea7cf5dba9eeadd0b19481d9f5defb7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_70cf755f1485c065222be4daab84283a9c3d0eb7.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_70cf755f1485c065222be4daab84283a9c3d0eb7.hip deleted file mode 100644 index cb4aca8f61dc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_70cf755f1485c065222be4daab84283a9c3d0eb7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_714c5369aa848021e020d874289e3ae4e0f74d77.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_714c5369aa848021e020d874289e3ae4e0f74d77.hip deleted file mode 100644 index 400cdb6a7e5e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_714c5369aa848021e020d874289e3ae4e0f74d77.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7177f939ac3dae8749cbf4232dcf04d2cf63b48f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7177f939ac3dae8749cbf4232dcf04d2cf63b48f.hip deleted file mode 100644 index a8ac0905530e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7177f939ac3dae8749cbf4232dcf04d2cf63b48f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71a2d046629a4b65c90d0e18d061c4984062f844.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71a2d046629a4b65c90d0e18d061c4984062f844.hip deleted file mode 100644 index a96e18441bd6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71a2d046629a4b65c90d0e18d061c4984062f844.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71b6100efe30d836dab557ea4ac54c4b9d35c6aa.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71b6100efe30d836dab557ea4ac54c4b9d35c6aa.hip deleted file mode 100644 index 65f5edef13d3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71b6100efe30d836dab557ea4ac54c4b9d35c6aa.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71dcbe9f481c92215f3b636bc0e86ce8f65e6472.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71dcbe9f481c92215f3b636bc0e86ce8f65e6472.hip deleted file mode 100644 index 0a16cc4b5bbb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71dcbe9f481c92215f3b636bc0e86ce8f65e6472.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - 
false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71e3980331dc4bcec6ab6f4c345c7b5f71356979.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71e3980331dc4bcec6ab6f4c345c7b5f71356979.hip deleted file mode 100644 index e8b3e80bf404..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71e3980331dc4bcec6ab6f4c345c7b5f71356979.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71e5fb3544dafa9da03fd2de4bb9bd0718f6009f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71e5fb3544dafa9da03fd2de4bb9bd0718f6009f.hip deleted file mode 100644 index d408c699f71a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_71e5fb3544dafa9da03fd2de4bb9bd0718f6009f.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 256, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7237ce5f3cf13ace3efc0b0227ae5a8c1fdfce1d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7237ce5f3cf13ace3efc0b0227ae5a8c1fdfce1d.hip deleted file mode 100644 index 93bd7364e524..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7237ce5f3cf13ace3efc0b0227ae5a8c1fdfce1d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_724d1d4408196d611b2e0535bf8833652acbd6ef.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_724d1d4408196d611b2e0535bf8833652acbd6ef.hip deleted file mode 100644 index a94b7aa89a80..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_724d1d4408196d611b2e0535bf8833652acbd6ef.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::fp16_t, - false, - true, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7264e378e1ea1d4dd97f6949d66f3492883b663e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7264e378e1ea1d4dd97f6949d66f3492883b663e.hip deleted file mode 100644 index ae23481cdd20..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7264e378e1ea1d4dd97f6949d66f3492883b663e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_72abb25dba0c48b380b2dabeb6ab7efaa706d180.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_72abb25dba0c48b380b2dabeb6ab7efaa706d180.hip deleted file mode 100644 index 974602b4240e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_72abb25dba0c48b380b2dabeb6ab7efaa706d180.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7309c38fc8a2d5ad6efd449107dc54a7509624fe.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7309c38fc8a2d5ad6efd449107dc54a7509624fe.hip deleted file mode 100644 index f931ad5c40d5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7309c38fc8a2d5ad6efd449107dc54a7509624fe.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - 
false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7344f96bed2f56793b1c2583485aa161cdf30379.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7344f96bed2f56793b1c2583485aa161cdf30379.hip deleted file mode 100644 index bbae0c6831c7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7344f96bed2f56793b1c2583485aa161cdf30379.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7393267865f1c2b0aa1a09a586f54cec98eea4ae.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7393267865f1c2b0aa1a09a586f54cec98eea4ae.hip deleted file mode 100644 index a098096fb995..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7393267865f1c2b0aa1a09a586f54cec98eea4ae.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 32, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_73d4901b8ef034590314048de7223a572d61ee0f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_73d4901b8ef034590314048de7223a572d61ee0f.hip deleted file mode 100644 index c2f02318aab7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_73d4901b8ef034590314048de7223a572d61ee0f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_73ec21ed6e040260c4f04ef68ef9307aa86985a7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_73ec21ed6e040260c4f04ef68ef9307aa86985a7.hip deleted file mode 100644 index 201c917ac1d0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_73ec21ed6e040260c4f04ef68ef9307aa86985a7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, 
ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_741401abfbbbdf0dd1d62df8bc3e85371ead71d6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_741401abfbbbdf0dd1d62df8bc3e85371ead71d6.hip deleted file mode 100644 index 115428bbd3e6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_741401abfbbbdf0dd1d62df8bc3e85371ead71d6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_743176ecb1f0bc800c870861585edf56f88d7739.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_743176ecb1f0bc800c870861585edf56f88d7739.hip deleted file mode 100644 index 22dcbcd3fa9c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_743176ecb1f0bc800c870861585edf56f88d7739.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_744ec604c577a27e0aae5b39711a9e2eb82801b6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_744ec604c577a27e0aae5b39711a9e2eb82801b6.hip deleted file mode 100644 index a65321800a91..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_744ec604c577a27e0aae5b39711a9e2eb82801b6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_745705ae121a1a331527cedfe4d31218a428a0df.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_745705ae121a1a331527cedfe4d31218a428a0df.hip deleted file mode 100644 index 33f0dca78936..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_745705ae121a1a331527cedfe4d31218a428a0df.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_748a3d76e8ab73af9a5d2302d33e3b1d1b866dd1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_748a3d76e8ab73af9a5d2302d33e3b1d1b866dd1.hip deleted file mode 100644 index e3c3b5c9ac18..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_748a3d76e8ab73af9a5d2302d33e3b1d1b866dd1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7497eca4d1a18306b406b367653622a8d64095bf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7497eca4d1a18306b406b367653622a8d64095bf.hip deleted file mode 100644 index ee033b78fdec..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7497eca4d1a18306b406b367653622a8d64095bf.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, 
ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_74ba59d347ce8916a22b40e6f22a3c89e13db4d0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_74ba59d347ce8916a22b40e6f22a3c89e13db4d0.hip deleted file mode 100644 index 4245769177b0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_74ba59d347ce8916a22b40e6f22a3c89e13db4d0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_74d5f2aef029f2103bb419cc982cae99fd1a9253.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_74d5f2aef029f2103bb419cc982cae99fd1a9253.hip deleted file mode 100644 index bfc786b112d6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_74d5f2aef029f2103bb419cc982cae99fd1a9253.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7524904ac5a2040c7ea72aef5942212f291a21bf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7524904ac5a2040c7ea72aef5942212f291a21bf.hip deleted file mode 100644 index df2a57dab01b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7524904ac5a2040c7ea72aef5942212f291a21bf.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_758b211174da0f398b2a093e7389905b4f9c4060.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_758b211174da0f398b2a093e7389905b4f9c4060.hip deleted file mode 100644 index 71fc61bebf4b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_758b211174da0f398b2a093e7389905b4f9c4060.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7596c14b8fee751d03f42ca48ea4f66e87fc2e2f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7596c14b8fee751d03f42ca48ea4f66e87fc2e2f.hip deleted file mode 100644 index 0276d8ab2551..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7596c14b8fee751d03f42ca48ea4f66e87fc2e2f.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7597ce4d2e5264bdeda47487d5bdb55a014c6616.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7597ce4d2e5264bdeda47487d5bdb55a014c6616.hip deleted file mode 100644 index 
88817c29e52b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7597ce4d2e5264bdeda47487d5bdb55a014c6616.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 
blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_75a310a6eb86e3e8baac7a930c3ffbef372942b3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_75a310a6eb86e3e8baac7a930c3ffbef372942b3.hip deleted file mode 100644 index ce3f7983eac1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_75a310a6eb86e3e8baac7a930c3ffbef372942b3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_75c38912947881caa14b3fc7ab7bca317e296dc3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_75c38912947881caa14b3fc7ab7bca317e296dc3.hip deleted file mode 100644 index 83fc102c0a30..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_75c38912947881caa14b3fc7ab7bca317e296dc3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = 
ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_75f2010bf6c478d2f0eba77e912697661306c1cb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_75f2010bf6c478d2f0eba77e912697661306c1cb.hip deleted file mode 100644 index 7028223fc648..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_75f2010bf6c478d2f0eba77e912697661306c1cb.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - true, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::bf16_t, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_75f21e38ad01fade35b1db40adabd75eb602410c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_75f21e38ad01fade35b1db40adabd75eb602410c.hip deleted file mode 100644 index 4ef1fb0d6a34..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_75f21e38ad01fade35b1db40adabd75eb602410c.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7601e6aea44b96e94fb019501be6b102c6e6a654.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7601e6aea44b96e94fb019501be6b102c6e6a654.hip deleted file mode 100644 index a25ef56d4f8d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7601e6aea44b96e94fb019501be6b102c6e6a654.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_761bde840c0c8149b24a8f6f264e963c4e9e8ceb.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_761bde840c0c8149b24a8f6f264e963c4e9e8ceb.hip deleted file mode 100644 index 5dfcc3c8d8e4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_761bde840c0c8149b24a8f6f264e963c4e9e8ceb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_765940baaaa2ae6ade43ef4c94a220eaa63702b0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_765940baaaa2ae6ade43ef4c94a220eaa63702b0.hip deleted file mode 100644 index 45dcf1543807..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_765940baaaa2ae6ade43ef4c94a220eaa63702b0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_76674fc182dfa6329c73a354aa3adf458429444a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_76674fc182dfa6329c73a354aa3adf458429444a.hip deleted file mode 100644 index 4d6154fa41d5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_76674fc182dfa6329c73a354aa3adf458429444a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_76704ca28a4877a1e84022e022614709adabb280.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_76704ca28a4877a1e84022e022614709adabb280.hip deleted file mode 100644 index d561dbf7a1cd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_76704ca28a4877a1e84022e022614709adabb280.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_768c80fd3ea17813df1bf19a158186834fd00780.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_768c80fd3ea17813df1bf19a158186834fd00780.hip deleted file mode 100644 index 23abb8024397..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_768c80fd3ea17813df1bf19a158186834fd00780.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, 
grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_76be322fc072ca19baa82707e260c6eba936ae19.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_76be322fc072ca19baa82707e260c6eba936ae19.hip deleted file mode 100644 index e9a3df87b220..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_76be322fc072ca19baa82707e260c6eba936ae19.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - 
fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_76f884e9ca116ee47b446efe9fc770c178a858d5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_76f884e9ca116ee47b446efe9fc770c178a858d5.hip deleted file mode 100644 index e0ec95dc58f3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_76f884e9ca116ee47b446efe9fc770c178a858d5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_770ad1eb1b30ad8f1e7c17df486093129b2d5630.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_770ad1eb1b30ad8f1e7c17df486093129b2d5630.hip deleted file mode 100644 index 8643fee9bd74..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_770ad1eb1b30ad8f1e7c17df486093129b2d5630.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_77200e875e0ef160b311c7de450c137772312d0d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_77200e875e0ef160b311c7de450c137772312d0d.hip deleted file mode 100644 index 73a829e1324c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_77200e875e0ef160b311c7de450c137772312d0d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_772016803aa3ca6ebe785557118365f9be7c4339.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_772016803aa3ca6ebe785557118365f9be7c4339.hip deleted file mode 100644 index 6861aa9c8540..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_772016803aa3ca6ebe785557118365f9be7c4339.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7726be8909f631c04d4395fa4ffd03a736f447f1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7726be8909f631c04d4395fa4ffd03a736f447f1.hip deleted file mode 100644 index 90de6047eabf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7726be8909f631c04d4395fa4ffd03a736f447f1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7728d5bec7941c9b6d5632bee8d67ed92b9c03ec.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7728d5bec7941c9b6d5632bee8d67ed92b9c03ec.hip deleted file mode 100644 index 6eacd14761ea..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7728d5bec7941c9b6d5632bee8d67ed92b9c03ec.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7764814a0de7702f0b7b5ce9dede6440603f4853.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7764814a0de7702f0b7b5ce9dede6440603f4853.hip deleted file mode 100644 index 240efadcbe2f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7764814a0de7702f0b7b5ce9dede6440603f4853.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_77a814291d8f01870274149b9d82fb75921d6e20.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_77a814291d8f01870274149b9d82fb75921d6e20.hip deleted file mode 100644 index 2c04d2cb994c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_77a814291d8f01870274149b9d82fb75921d6e20.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_77d0223697ed41c4c2fd8830f8df6e5620db547f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_77d0223697ed41c4c2fd8830f8df6e5620db547f.hip deleted file mode 100644 index 7c3aca0d26a5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_77d0223697ed41c4c2fd8830f8df6e5620db547f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7831ce329f2a0812ebb1dd103ea4ba8cb7ba531d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7831ce329f2a0812ebb1dd103ea4ba8cb7ba531d.hip deleted file mode 100644 index 75adc7336ae5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7831ce329f2a0812ebb1dd103ea4ba8cb7ba531d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, 
blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7838849e57ee9cd292e588f587a8079b57becfc8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7838849e57ee9cd292e588f587a8079b57becfc8.hip deleted file mode 100644 index 990ab970d753..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7838849e57ee9cd292e588f587a8079b57becfc8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - 
fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_783ec08544591a22f59dc12f169b7327b4185a1a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_783ec08544591a22f59dc12f169b7327b4185a1a.hip deleted file mode 100644 index cb93457d2e4a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_783ec08544591a22f59dc12f169b7327b4185a1a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_784c35fee4d372123631312f1051c43e1fa12378.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_784c35fee4d372123631312f1051c43e1fa12378.hip deleted file mode 100644 index ac6ad19275d8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_784c35fee4d372123631312f1051c43e1fa12378.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_78663faeb0425f45e8a0da0f7b1a5ddbee5e07e7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_78663faeb0425f45e8a0da0f7b1a5ddbee5e07e7.hip deleted file mode 100644 index 629c6dc42f66..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_78663faeb0425f45e8a0da0f7b1a5ddbee5e07e7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7872c45ba170f2782c4b5b75cfc78ac79a4cf157.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7872c45ba170f2782c4b5b75cfc78ac79a4cf157.hip deleted file mode 100644 index 0c76c45f5df7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7872c45ba170f2782c4b5b75cfc78ac79a4cf157.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7878e2a4d3b96a552e03d1ffc33debfd50c9f7f1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7878e2a4d3b96a552e03d1ffc33debfd50c9f7f1.hip deleted file mode 100644 index b63ad6a43b7d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7878e2a4d3b96a552e03d1ffc33debfd50c9f7f1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_78e1edca5abe1bb3e7aa946eab6484b7bed806a3.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_78e1edca5abe1bb3e7aa946eab6484b7bed806a3.hip deleted file mode 100644 index e952938b0dbc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_78e1edca5abe1bb3e7aa946eab6484b7bed806a3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_78e945db4afa1330fe3978bc1bc9ae99828ae287.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_78e945db4afa1330fe3978bc1bc9ae99828ae287.hip deleted file mode 100644 index ad3c80014728..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_78e945db4afa1330fe3978bc1bc9ae99828ae287.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - 
false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_78f7e2a2c08cd87702793f91b6935cbe4c22be55.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_78f7e2a2c08cd87702793f91b6935cbe4c22be55.hip deleted file mode 100644 index 283fcca1f5b0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_78f7e2a2c08cd87702793f91b6935cbe4c22be55.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_797750ac0b18b48f56ceb4640256e9bd3a36621a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_797750ac0b18b48f56ceb4640256e9bd3a36621a.hip deleted file mode 100644 index ad4515d6a150..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_797750ac0b18b48f56ceb4640256e9bd3a36621a.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7993fc08ac5c6ce7a2eceb1227f4e3718dc4cf5f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7993fc08ac5c6ce7a2eceb1227f4e3718dc4cf5f.hip deleted file mode 100644 index fd4a42f1a403..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7993fc08ac5c6ce7a2eceb1227f4e3718dc4cf5f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_79a7dce707954e765d97cb22e57d9bd6168860d9.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_79a7dce707954e765d97cb22e57d9bd6168860d9.hip deleted file mode 100644 index d9d093e8665f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_79a7dce707954e765d97cb22e57d9bd6168860d9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_79d0b8053ddf99a4d4447656d733c2da026b3a7c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_79d0b8053ddf99a4d4447656d733c2da026b3a7c.hip deleted file mode 100644 index 10636d6a616a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_79d0b8053ddf99a4d4447656d733c2da026b3a7c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_79f182ae021e23869d7bebf2a9b4575bdc910ed0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_79f182ae021e23869d7bebf2a9b4575bdc910ed0.hip deleted file mode 100644 index 457b54d3dace..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_79f182ae021e23869d7bebf2a9b4575bdc910ed0.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a0ab620e6d62259a559e329460e46e6e3f7c3f9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a0ab620e6d62259a559e329460e46e6e3f7c3f9.hip deleted file mode 100644 index 74bb83256b39..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a0ab620e6d62259a559e329460e46e6e3f7c3f9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - 
std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a13d62a715fd717f0d4101f787349cb49cbe70f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a13d62a715fd717f0d4101f787349cb49cbe70f.hip deleted file mode 100644 index 53418d562863..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a13d62a715fd717f0d4101f787349cb49cbe70f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a242e5953f44316b6a4f6587ec26283ed6cbcae.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a242e5953f44316b6a4f6587ec26283ed6cbcae.hip deleted file mode 100644 index d75e6e191f6a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a242e5953f44316b6a4f6587ec26283ed6cbcae.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a2e032f6500fbc5468183415b6dd1d3e43f0bee.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a2e032f6500fbc5468183415b6dd1d3e43f0bee.hip deleted file mode 100644 index dff447dd2ce2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a2e032f6500fbc5468183415b6dd1d3e43f0bee.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a890b126da2d8cfbf84f048b779cac2dd56b509.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a890b126da2d8cfbf84f048b779cac2dd56b509.hip deleted file mode 100644 index 88d05d2d8964..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a890b126da2d8cfbf84f048b779cac2dd56b509.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a902ed4ae3cc6558c73b730ff3949778007a230.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a902ed4ae3cc6558c73b730ff3949778007a230.hip deleted file mode 100644 index cae30bbc8027..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7a902ed4ae3cc6558c73b730ff3949778007a230.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7aa14aa94d625b33df1adfa30ef4d91769592608.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7aa14aa94d625b33df1adfa30ef4d91769592608.hip deleted file mode 100644 index 3138c9f5a7e7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7aa14aa94d625b33df1adfa30ef4d91769592608.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::fp16_t, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ab03a62e064864e1e9c1cd506c1b2e1786a777c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ab03a62e064864e1e9c1cd506c1b2e1786a777c.hip deleted file mode 100644 index 606cca5dd072..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ab03a62e064864e1e9c1cd506c1b2e1786a777c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7adf69b51f0a8cc9ae7e250e60df38758230fe4f.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7adf69b51f0a8cc9ae7e250e60df38758230fe4f.hip deleted file mode 100644 index 26a042289b47..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7adf69b51f0a8cc9ae7e250e60df38758230fe4f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7afd1a756247b15b078d15a39e350a07c22982da.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7afd1a756247b15b078d15a39e350a07c22982da.hip deleted file mode 100644 index 3ad5895b7439..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7afd1a756247b15b078d15a39e350a07c22982da.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b2d3680c3578c7292349b58843aef7a82e0087d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b2d3680c3578c7292349b58843aef7a82e0087d.hip deleted file mode 100644 index 179b17c14a7b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b2d3680c3578c7292349b58843aef7a82e0087d.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b5680f97836be4a369802e8115617a83875703e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b5680f97836be4a369802e8115617a83875703e.hip deleted file mode 100644 index 55123992c1cf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b5680f97836be4a369802e8115617a83875703e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - 
std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b67045d438a7e4b8f3a313a5df5a85f351c1be5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b67045d438a7e4b8f3a313a5df5a85f351c1be5.hip deleted file mode 100644 index a76f03ddda75..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b67045d438a7e4b8f3a313a5df5a85f351c1be5.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::fp16_t, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b7fa76609243a8709f349ffc0d9d88157f28dc9.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b7fa76609243a8709f349ffc0d9d88157f28dc9.hip deleted file mode 100644 index 60b1d61672e3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b7fa76609243a8709f349ffc0d9d88157f28dc9.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::fp16_t, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b9a3bf1a9b37e0bd9bae6249609e5994dc0dba1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b9a3bf1a9b37e0bd9bae6249609e5994dc0dba1.hip deleted file mode 100644 index 11548e314ba2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7b9a3bf1a9b37e0bd9bae6249609e5994dc0dba1.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7bb7b63e8a4c1df4eac4d978e166867195bd6e53.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7bb7b63e8a4c1df4eac4d978e166867195bd6e53.hip deleted file mode 100644 index 3f0066d8e656..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7bb7b63e8a4c1df4eac4d978e166867195bd6e53.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7c19fc90e5a9c422dbf529d2def286f47dea0f50.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7c19fc90e5a9c422dbf529d2def286f47dea0f50.hip deleted file mode 100644 index dbac69a6daa2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7c19fc90e5a9c422dbf529d2def286f47dea0f50.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7c23dde1a386436e9864c8fa5f1706c0d2fbfd0d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7c23dde1a386436e9864c8fa5f1706c0d2fbfd0d.hip deleted file mode 100644 index a5529a6cccae..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7c23dde1a386436e9864c8fa5f1706c0d2fbfd0d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include 
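// ----------------------------------------------------------------------------
// Illustrative sketch, not part of the generated sources: each of these
// translation units defines the same pair of launch wrappers that follow
// below. The float-returning fmha_bwd_dq_dk_dv_<trait>() goes through
// ck_tile::launch_kernel() (printing the kernel name first when
// s.log_level_ > 0) and returns whatever timing value launch_kernel reports,
// while the *_oneshot_ variant enqueues the kernel exactly once on the stream
// held by the stream_config. The helper name run_ck_fmha_bwd and the
// assumption that fmha_bwd_args arrives already populated are illustrative
// only.
#include <hip/hip_runtime.h>

static float run_ck_fmha_bwd(hipStream_t stream, fmha_bwd_args args)
{
    // Aggregate-initialize from the HIP stream, mirroring the
    // ck_tile::stream_config{s.stream_id_} construction used by the oneshot
    // wrapper; the remaining members (logging, timing) keep their defaults.
    ck_tile::stream_config cfg{stream};

    // Timed path; for a fire-and-forget launch one would instead call
    // fmha_bwd_dq_dk_dv_oneshot_<dq_dk_dv_trait_0>(cfg, args);
    return fmha_bwd_dq_dk_dv_<dq_dk_dv_trait_0>(cfg, args);
}
// ----------------------------------------------------------------------------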
- -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7c3d8ef4da515960bf40eb1feb04d21950ad5ae5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7c3d8ef4da515960bf40eb1feb04d21950ad5ae5.hip deleted file mode 100644 index d3433f804b6b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7c3d8ef4da515960bf40eb1feb04d21950ad5ae5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - 
typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7c4710e8f4e27fae4ae079f1667c3a1879cb6da8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7c4710e8f4e27fae4ae079f1667c3a1879cb6da8.hip deleted file mode 100644 index 1114a63ffb4f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7c4710e8f4e27fae4ae079f1667c3a1879cb6da8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
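// ----------------------------------------------------------------------------
// Illustrative sketch, not part of the generated sources: generate.py emits
// one translation unit per point in the tuning space (data type, head dim,
// pipeline enum, mask, bias enum, dropout and padding flags), and the
// fmha_bwd_dq_dk_dv_traits_<...> alias in each file is the compile-time key
// for that point. A runtime dispatcher then only forwards to the matching
// explicit specialization. In the sketch, TraitFp16 / TraitBf16 and the
// dispatch helper itself are hypothetical names standing in for two concrete
// trait aliases.
template <typename TraitFp16, typename TraitBf16>
float dispatch_bwd_by_dtype(bool is_bf16,
                            const ck_tile::stream_config& s,
                            fmha_bwd_args a)
{
    // Tile shape, pipeline, and bias handling were fixed at code-gen time;
    // the only runtime choice is which explicit instantiation to call.
    return is_bf16 ? fmha_bwd_dq_dk_dv_<TraitBf16>(s, a)
                   : fmha_bwd_dq_dk_dv_<TraitFp16>(s, a);
}
// ----------------------------------------------------------------------------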
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7cbe4562c51d6829ec5942e11035c452fe318b3a.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7cbe4562c51d6829ec5942e11035c452fe318b3a.hip deleted file mode 100644 index 75ff01bb255a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7cbe4562c51d6829ec5942e11035c452fe318b3a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7cdc419d4248dfdeeab1f0980aec35fa134e52e0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7cdc419d4248dfdeeab1f0980aec35fa134e52e0.hip deleted file mode 100644 index b727c425b11c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7cdc419d4248dfdeeab1f0980aec35fa134e52e0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - 
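// ----------------------------------------------------------------------------
// Illustrative sketch, not part of the generated sources: a naive single-head
// reference for the quantities the fused dQ/dK/dV kernels in this patch
// compute, with P taken to be the forward softmax probabilities
// (P_ij = exp(scale * Q_i . K_j - LSE_i)). The real kernels tile these loops,
// fuse the D term, and additionally handle masking, bias, and dropout; this is
// only the mathematical contract, with no claim about the ck_tile
// implementation.
//   D_i   = sum_d dO[i][d] * O[i][d]
//   dV    = P^T dO
//   dP    = dO V^T
//   dS_ij = P_ij * (dP_ij - D_i)
//   dQ    = scale * dS K,   dK = scale * dS^T Q
#include <vector>

void fmha_bwd_reference(int sq, int sk, int d,
                        const std::vector<float>& Q,   // [sq][d]
                        const std::vector<float>& K,   // [sk][d]
                        const std::vector<float>& V,   // [sk][d]
                        const std::vector<float>& P,   // [sq][sk] forward probs
                        const std::vector<float>& dO,  // [sq][d]
                        const std::vector<float>& O,   // [sq][d]
                        float scale,
                        std::vector<float>& dQ,        // [sq][d]
                        std::vector<float>& dK,        // [sk][d]
                        std::vector<float>& dV)        // [sk][d]
{
    dQ.assign(sq * d, 0.f);
    dK.assign(sk * d, 0.f);
    dV.assign(sk * d, 0.f);

    for (int i = 0; i < sq; ++i) {
        // D_i is exactly what the fmha_bwd_dot_do_o preprocessing kernel
        // writes out once per query row.
        float D = 0.f;
        for (int e = 0; e < d; ++e) D += dO[i * d + e] * O[i * d + e];

        for (int j = 0; j < sk; ++j) {
            float dP = 0.f;                                  // dP_ij = dO_i . V_j
            for (int e = 0; e < d; ++e) dP += dO[i * d + e] * V[j * d + e];

            const float p  = P[i * sk + j];
            const float dS = p * (dP - D);                   // softmax backward

            for (int e = 0; e < d; ++e) {
                dV[j * d + e] += p * dO[i * d + e];          // dV = P^T dO
                dQ[i * d + e] += scale * dS * K[j * d + e];  // dQ = scale * dS K
                dK[j * d + e] += scale * dS * Q[i * d + e];  // dK = scale * dS^T Q
            }
        }
    }
}
// ----------------------------------------------------------------------------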
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d08373ace7087bdaca4ce8b0bc329f553f88d77.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d08373ace7087bdaca4ce8b0bc329f553f88d77.hip deleted file mode 100644 index eb5e4cb69660..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d08373ace7087bdaca4ce8b0bc329f553f88d77.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d0f767c17385eb7d756cbe8ed444d7cef72dea5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d0f767c17385eb7d756cbe8ed444d7cef72dea5.hip deleted file mode 100644 index 31b5a9b567d2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d0f767c17385eb7d756cbe8ed444d7cef72dea5.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 256, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, false, false, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d12e9cb599d24631c082e3cf65d2c58b6d4d44f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d12e9cb599d24631c082e3cf65d2c58b6d4d44f.hip deleted file mode 100644 index f781a6ffc610..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d12e9cb599d24631c082e3cf65d2c58b6d4d44f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
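// ----------------------------------------------------------------------------
// Illustrative sketch, not part of the generated sources: the fmha_bwd_dot_do_o
// file deleted just above instantiates the small preprocessing kernel that
// writes D[i] = sum_d dO[i][d] * O[i][d] for every query row (64-thread
// blocks, head dim up to 256 in this variant), so the fused dQ/dK/dV kernel
// can read D instead of recomputing it. One plausible way the two stages are
// sequenced on a single stream is sketched below; reusing one fmha_bwd_args
// for both launches, and the specific trait aliases named, are assumptions
// for illustration.
#include <hip/hip_runtime.h>

static void run_ck_fmha_bwd_staged(hipStream_t stream, fmha_bwd_args args)
{
    ck_tile::stream_config cfg{stream};

    // Stage 1: populate the D (dot(dO, O)) buffer referenced by `args`.
    fmha_bwd_dot_do_o_oneshot_<dot_do_o_trait_0>(cfg, args);

    // Stage 2: the fused kernel consumes Q, K, V, dO, LSE and the D written
    // by stage 1, producing dQ, dK and dV.
    fmha_bwd_dq_dk_dv_oneshot_<dq_dk_dv_trait_0>(cfg, args);
}
// ----------------------------------------------------------------------------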
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d2f87c021e0b6a27b2d7e30351fd50f06414b5f.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d2f87c021e0b6a27b2d7e30351fd50f06414b5f.hip deleted file mode 100644 index 28b073a0087f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d2f87c021e0b6a27b2d7e30351fd50f06414b5f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d5667b27f15a06d4040354fba3601d48bb9c045.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d5667b27f15a06d4040354fba3601d48bb9c045.hip deleted file mode 100644 index a82404cc0093..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7d5667b27f15a06d4040354fba3601d48bb9c045.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7dac5d4cf103d658e129673549549f1276f134e0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7dac5d4cf103d658e129673549549f1276f134e0.hip deleted file mode 100644 index 2f58ff3a4fe0..000000000000 
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7dac5d4cf103d658e129673549549f1276f134e0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = 
k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7dd260849b86c46b685955cab54ba07d49b47954.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7dd260849b86c46b685955cab54ba07d49b47954.hip deleted file mode 100644 index 5cf98e1f5e42..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7dd260849b86c46b685955cab54ba07d49b47954.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ddd621da88c57798db1e689b93b692b6519ff96.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ddd621da88c57798db1e689b93b692b6519ff96.hip deleted file mode 100644 index 3a15564be098..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ddd621da88c57798db1e689b93b692b6519ff96.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = 
ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7dfe21ee27f8a0ca0407ef0dea73cd73ae6940db.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7dfe21ee27f8a0ca0407ef0dea73cd73ae6940db.hip deleted file mode 100644 index 1ef8422004e9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7dfe21ee27f8a0ca0407ef0dea73cd73ae6940db.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
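// ----------------------------------------------------------------------------
// Illustrative sketch, not part of the generated sources: variants such as the
// one just above are built with ck_tile::BlockAttentionBiasEnum::ALIBI, i.e.
// instead of loading a bias tensor the kernel derives a bias from a per-head
// ALiBi slope. Standard ALiBi penalizes the scaled logit in proportion to the
// query/key distance, as in the scalar helper below; the exact distance and
// sign convention used inside ck_tile is not asserted here.
#include <cstdlib>

inline float alibi_logit(float qk_scaled, float slope, int q_idx, int k_idx)
{
    // qk_scaled = scale * (Q_i . K_j); nearby positions are penalized least.
    return qk_scaled - slope * static_cast<float>(std::abs(q_idx - k_idx));
}
// ----------------------------------------------------------------------------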
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e1bdde812c332c9fc58613698568a04771b9fa8.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e1bdde812c332c9fc58613698568a04771b9fa8.hip deleted file mode 100644 index f0199b16708b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e1bdde812c332c9fc58613698568a04771b9fa8.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e332a6aeecfb12dcf70c69157fd3137343fb9f6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e332a6aeecfb12dcf70c69157fd3137343fb9f6.hip deleted file mode 100644 index 4397946dadae..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e332a6aeecfb12dcf70c69157fd3137343fb9f6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
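// ----------------------------------------------------------------------------
// Illustrative sketch, not part of the generated sources: forward-path files
// such as the QRKSVS_ASYNC instantiation deleted just above define only the
// timed fmha_fwd_<trait>() entry point. The kernel's contract, in the usual
// flash-attention convention assumed here, is O = softmax(scale * Q K^T
// [+ bias]) * V together with the per-row log-sum-exp (LSE) that the backward
// instantiations in this patch take as input. A naive single-head reference of
// that contract (no mask, bias, or dropout):
#include <cmath>
#include <vector>

void fmha_fwd_reference(int sq, int sk, int d,
                        const std::vector<float>& Q,  // [sq][d]
                        const std::vector<float>& K,  // [sk][d]
                        const std::vector<float>& V,  // [sk][d]
                        float scale,
                        std::vector<float>& O,        // [sq][d]
                        std::vector<float>& LSE)      // [sq]
{
    O.assign(sq * d, 0.f);
    LSE.assign(sq, 0.f);
    std::vector<float> s(sk);

    for (int i = 0; i < sq; ++i) {
        float m = -INFINITY;                           // running row maximum
        for (int j = 0; j < sk; ++j) {
            float acc = 0.f;
            for (int e = 0; e < d; ++e) acc += Q[i * d + e] * K[j * d + e];
            s[j] = scale * acc;                        // mask / ALiBi would be added here
            m = std::fmax(m, s[j]);
        }
        float sum = 0.f;
        for (int j = 0; j < sk; ++j) sum += std::exp(s[j] - m);
        LSE[i] = m + std::log(sum);                    // stored for the backward pass
        for (int j = 0; j < sk; ++j) {
            const float p = std::exp(s[j] - LSE[i]);   // softmax probability P_ij
            for (int e = 0; e < d; ++e) O[i * d + e] += p * V[j * d + e];
        }
    }
}
// ----------------------------------------------------------------------------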
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e6129eead18d13a4a6cb9550384fddabc7a2a16.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e6129eead18d13a4a6cb9550384fddabc7a2a16.hip deleted file mode 100644 index c009b505f4f0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e6129eead18d13a4a6cb9550384fddabc7a2a16.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e89f79217037e361bb0909d06534e40f5026b4f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e89f79217037e361bb0909d06534e40f5026b4f.hip deleted file mode 100644 index 4739379775d9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e89f79217037e361bb0909d06534e40f5026b4f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e9519dd0d0f940fd5efd61bd32df7528ba7e3fc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e9519dd0d0f940fd5efd61bd32df7528ba7e3fc.hip deleted file mode 100644 index ea24e86a28f3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e9519dd0d0f940fd5efd61bd32df7528ba7e3fc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e9c7feb747241c9c7de2adf3a19933a1c4c0995.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e9c7feb747241c9c7de2adf3a19933a1c4c0995.hip deleted file mode 100644 index 88cabf351126..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7e9c7feb747241c9c7de2adf3a19933a1c4c0995.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ea9c37d92e344f3cc58cd4d1d00f19167e3623e.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ea9c37d92e344f3cc58cd4d1d00f19167e3623e.hip deleted file mode 100644 index 936d1899f4d6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ea9c37d92e344f3cc58cd4d1d00f19167e3623e.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ec038393ec329a894aee9bbac078a40f57a4684.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ec038393ec329a894aee9bbac078a40f57a4684.hip deleted file mode 100644 index a2324dfee77c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ec038393ec329a894aee9bbac078a40f57a4684.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ec04763d635c5bc3e810737b5d948c59f117d5a.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ec04763d635c5bc3e810737b5d948c59f117d5a.hip deleted file mode 100644 index 259c60ca13c0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ec04763d635c5bc3e810737b5d948c59f117d5a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ee953cb24e28bcdc8f05783894b23cbf83bdf35.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ee953cb24e28bcdc8f05783894b23cbf83bdf35.hip deleted file mode 100644 index 117ed8c59c35..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ee953cb24e28bcdc8f05783894b23cbf83bdf35.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7f6ccdb3c2d595fffd05bc5e6417b157276547fb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7f6ccdb3c2d595fffd05bc5e6417b157276547fb.hip deleted file mode 100644 index d6e33f0a43ed..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7f6ccdb3c2d595fffd05bc5e6417b157276547fb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7f80d44e82e601dc48d4c8b4e710ef7265894b6c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7f80d44e82e601dc48d4c8b4e710ef7265894b6c.hip deleted file mode 100644 index 04eab9f3a2c9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7f80d44e82e601dc48d4c8b4e710ef7265894b6c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7f9403cb91d6aabebf081afae94a8ba397d8d24f.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7f9403cb91d6aabebf081afae94a8ba397d8d24f.hip deleted file mode 100644 index e1c494516ffc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7f9403cb91d6aabebf081afae94a8ba397d8d24f.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::bf16_t, - false, - false, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7f9bb3486fee7b7c9e24300b8a4e4ce88a11bfc0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7f9bb3486fee7b7c9e24300b8a4e4ce88a11bfc0.hip deleted file mode 100644 index 0139958c7459..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7f9bb3486fee7b7c9e24300b8a4e4ce88a11bfc0.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7fa76fc1b066a15b08dc6c24a7cf33a58b4cb6cb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7fa76fc1b066a15b08dc6c24a7cf33a58b4cb6cb.hip deleted file mode 100644 index 8a13f4c6e39d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7fa76fc1b066a15b08dc6c24a7cf33a58b4cb6cb.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7fe409f4421193fb48a54aa5f26bd6229d23204c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7fe409f4421193fb48a54aa5f26bd6229d23204c.hip deleted file mode 100644 index 58d6c86822d5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7fe409f4421193fb48a54aa5f26bd6229d23204c.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
-
-// auto generated by generate.py
-#include
-
-using fmha_dtype_0 = ck_tile::bf16_t;
-
-using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>;
-using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>;
-
-using fmha_shape_0 = ck_tile::TileFmhaShape,
- fmha_warp_tile_0,
- ck_tile::sequence<4, 1, 1>,
- fmha_warp_tile_0,
- true>;
-
-using fmha_trait_0 = ck_tile::TileFmhaTraits;
-using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask;
-
-using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem<
- typename FmhaFwdTypeConfig::QDataType,
- typename FmhaFwdTypeConfig::KDataType,
- typename FmhaFwdTypeConfig::VDataType,
- typename FmhaFwdTypeConfig::SaccDataType,
- typename FmhaFwdTypeConfig::SMPLComputeDataType,
- typename FmhaFwdTypeConfig::BiasDataType,
- typename FmhaFwdTypeConfig::RandValOutputDataType,
- typename FmhaFwdTypeConfig::LSEDataType,
- typename FmhaFwdTypeConfig::PDataType,
- typename FmhaFwdTypeConfig::OaccDataType,
- typename FmhaFwdTypeConfig::ODataType,
- fmha_shape_0,
- false,
- fmha_mask_0,
- fmha_trait_0>;
-
-using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync<
- fmha_pipeline_problem_0>;
-
-using fmha_epilogue_0 =
- ck_tile::Default2DEpilogue::OaccDataType,
- typename FmhaFwdTypeConfig::ODataType,
- true, true>>;
-
-using fmha_kernel_0 =
- ck_tile::FmhaFwdKernel,
- fmha_pipeline_0,
- fmha_epilogue_0>;
-
-using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true,
- ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
-
-#include
-
-template<>
-float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a)
-{
- using k_ = fmha_kernel_0;
- if(s.log_level_ > 0)
- std::cout << ", " << k_::GetName() << std::flush;
- auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a);
- constexpr dim3 blocks = k_::BlockSize();
- constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
- return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs));
-}
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ff65c7abd9b0d8a2df9302d6dc167637b3a72f0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ff65c7abd9b0d8a2df9302d6dc167637b3a72f0.hip
deleted file mode 100644
index a408de679290..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_7ff65c7abd9b0d8a2df9302d6dc167637b3a72f0.hip
+++ /dev/null
@@ -1,80 +0,0 @@
-// ==========================================
-// THIS CODE IS AUTOGENERATED. DO NOT MODIFY.
-// @generated
-// ==========================================
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8004763f674dfb3f14b66dfdeb2a046e413ce2cb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8004763f674dfb3f14b66dfdeb2a046e413ce2cb.hip deleted file mode 100644 index 5e9b6e48392d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8004763f674dfb3f14b66dfdeb2a046e413ce2cb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8007bf7ae1b71bf8ac4a793aa519ad333aa7a7ba.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8007bf7ae1b71bf8ac4a793aa519ad333aa7a7ba.hip deleted file mode 100644 index 3e015f8f7029..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8007bf7ae1b71bf8ac4a793aa519ad333aa7a7ba.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8021fa266c77e6b5bd1af2a9c22c686e5a6eac78.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8021fa266c77e6b5bd1af2a9c22c686e5a6eac78.hip deleted file mode 100644 index ac28a3328468..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8021fa266c77e6b5bd1af2a9c22c686e5a6eac78.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - 
-template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_802b21f9588d72c3c3e3b9a3b269f19c484d5aa4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_802b21f9588d72c3c3e3b9a3b269f19c484d5aa4.hip deleted file mode 100644 index 4b60fc9210e0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_802b21f9588d72c3c3e3b9a3b269f19c484d5aa4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - 
typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8046f566fa7188c92568b277354e8b06ad382544.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8046f566fa7188c92568b277354e8b06ad382544.hip deleted file mode 100644 index c215fe79952f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8046f566fa7188c92568b277354e8b06ad382544.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_806f9ab9baf631df1d3a8d801e4cf93a102526cf.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_806f9ab9baf631df1d3a8d801e4cf93a102526cf.hip deleted file mode 100644 index 15f6db450b18..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_806f9ab9baf631df1d3a8d801e4cf93a102526cf.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_807545400aa6e70ff49a5f38ed6a218a180bd87f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_807545400aa6e70ff49a5f38ed6a218a180bd87f.hip deleted file mode 100644 index 1a617692a47f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_807545400aa6e70ff49a5f38ed6a218a180bd87f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - 
-template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80987e2d765efc320eaee813607c94c80ee35aa4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80987e2d765efc320eaee813607c94c80ee35aa4.hip deleted file mode 100644 index 3f19fb11a2f4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80987e2d765efc320eaee813607c94c80ee35aa4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - 
typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80a72d70d80b66c19e85daa00497308381050048.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80a72d70d80b66c19e85daa00497308381050048.hip deleted file mode 100644 index 61ad99831392..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80a72d70d80b66c19e85daa00497308381050048.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80bfb0e6032892cc58cef4dd403f305a5b76851b.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80bfb0e6032892cc58cef4dd403f305a5b76851b.hip deleted file mode 100644 index 086ab0ba7a54..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80bfb0e6032892cc58cef4dd403f305a5b76851b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80cf0997573f4bcfbaaf75e40f519580a7495a17.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80cf0997573f4bcfbaaf75e40f519580a7495a17.hip deleted file mode 100644 index c7eff142a038..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80cf0997573f4bcfbaaf75e40f519580a7495a17.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80efc341089a50ed5669b3c86f6ddd9b124d1442.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80efc341089a50ed5669b3c86f6ddd9b124d1442.hip deleted file mode 100644 index 
4cbab736b6b5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80efc341089a50ed5669b3c86f6ddd9b124d1442.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = 
k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80f51f0e178c33e6196df1d2e47bd38bf5391cc8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80f51f0e178c33e6196df1d2e47bd38bf5391cc8.hip deleted file mode 100644 index 66cd7a461f77..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80f51f0e178c33e6196df1d2e47bd38bf5391cc8.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80fb694fce7b4c3c459fca43c89c6002fbfdaef5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80fb694fce7b4c3c459fca43c89c6002fbfdaef5.hip deleted file mode 100644 index 2658df162189..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_80fb694fce7b4c3c459fca43c89c6002fbfdaef5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template 
<> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_810dd4e870ceda3ba9b5f0084a4b025b2e609d57.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_810dd4e870ceda3ba9b5f0084a4b025b2e609d57.hip deleted file mode 100644 index 4ab0c214f0b0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_810dd4e870ceda3ba9b5f0084a4b025b2e609d57.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - 
constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_811db756577b61cde9fe8279d956980db9ee21a4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_811db756577b61cde9fe8279d956980db9ee21a4.hip deleted file mode 100644 index de717ef0b7a7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_811db756577b61cde9fe8279d956980db9ee21a4.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - true, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::bf16_t, - true, - true, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_813e60e8405aca3f7fbed19452ae37574ada9a77.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_813e60e8405aca3f7fbed19452ae37574ada9a77.hip deleted file mode 100644 index 33aae40fecb2..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_813e60e8405aca3f7fbed19452ae37574ada9a77.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_815918206483d2ae04a45aa67d69dfb986587214.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_815918206483d2ae04a45aa67d69dfb986587214.hip deleted file mode 100644 index b6e2d5b3410e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_815918206483d2ae04a45aa67d69dfb986587214.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_816c48e129a0235cb3a19124ddb28cce286fb368.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_816c48e129a0235cb3a19124ddb28cce286fb368.hip deleted file mode 100644 index fb3b8e71300b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_816c48e129a0235cb3a19124ddb28cce286fb368.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_81acf1d17650712b71a499bb66909bfcfcb6aecb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_81acf1d17650712b71a499bb66909bfcfcb6aecb.hip deleted file mode 100644 index 219fd7dd1c9c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_81acf1d17650712b71a499bb66909bfcfcb6aecb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_81bb8f13b6f20a72c9ce6d0b53f81eddbf05f1c6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_81bb8f13b6f20a72c9ce6d0b53f81eddbf05f1c6.hip deleted file mode 100644 index 1b86e63fc6e0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_81bb8f13b6f20a72c9ce6d0b53f81eddbf05f1c6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, 
grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_81dd3ea61bb61de02667b14f5a94198f48c7307b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_81dd3ea61bb61de02667b14f5a94198f48c7307b.hip deleted file mode 100644 index dfe9501b99a7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_81dd3ea61bb61de02667b14f5a94198f48c7307b.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - true, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::fp16_t, - true, - true, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_81f6c575c3fa2ccc7e65022f1ba65c8cfc16541e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_81f6c575c3fa2ccc7e65022f1ba65c8cfc16541e.hip deleted file mode 100644 index fab9835e1c0c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_81f6c575c3fa2ccc7e65022f1ba65c8cfc16541e.hip +++ /dev/null @@ -1,80 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82048cf91270631f98ac37dc488a1fb2e00ce004.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82048cf91270631f98ac37dc488a1fb2e00ce004.hip deleted file mode 100644 index 9d110471afdb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82048cf91270631f98ac37dc488a1fb2e00ce004.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8250f27341241086515d833aa53ae873d4ece3fa.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8250f27341241086515d833aa53ae873d4ece3fa.hip deleted file mode 100644 index 7b2f9217aa6b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8250f27341241086515d833aa53ae873d4ece3fa.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 256, - true, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8278845045d68027dcf3bf867ecde2fb12ec51d3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8278845045d68027dcf3bf867ecde2fb12ec51d3.hip deleted file mode 100644 index bf38e35bfca1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8278845045d68027dcf3bf867ecde2fb12ec51d3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82ad0c0580516485ea432d98f53e73f6dfec548c.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82ad0c0580516485ea432d98f53e73f6dfec548c.hip deleted file mode 100644 index 2b4150ecc8ee..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82ad0c0580516485ea432d98f53e73f6dfec548c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82c932e6eaaf44861c794539d9caf8b50192fc44.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82c932e6eaaf44861c794539d9caf8b50192fc44.hip deleted file mode 100644 index 8b2eed5f8005..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82c932e6eaaf44861c794539d9caf8b50192fc44.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, 
- true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82d7f61e6313930f063758b61102e7a43b118beb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82d7f61e6313930f063758b61102e7a43b118beb.hip deleted file mode 100644 index 8f7f3de158ba..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82d7f61e6313930f063758b61102e7a43b118beb.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82f0f3d71108dcc49234a258f0f3b21ea2123cc0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82f0f3d71108dcc49234a258f0f3b21ea2123cc0.hip deleted file mode 100644 index 93daa709c449..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82f0f3d71108dcc49234a258f0f3b21ea2123cc0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) 
- std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82f1d7e1a93bf2fa80c409e6827ea88af56c44f0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82f1d7e1a93bf2fa80c409e6827ea88af56c44f0.hip deleted file mode 100644 index 54f9ec27a3fe..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_82f1d7e1a93bf2fa80c409e6827ea88af56c44f0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8301bfc0394936a68fa0098580f06e77c88ebed9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8301bfc0394936a68fa0098580f06e77c88ebed9.hip deleted file mode 100644 index 4cf9f48de04e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8301bfc0394936a68fa0098580f06e77c88ebed9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83080406598df6bd3102db70a554e496e29db96a.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83080406598df6bd3102db70a554e496e29db96a.hip deleted file mode 100644 index 3144707a1e30..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83080406598df6bd3102db70a554e496e29db96a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_830e3532f27b391585d5de90f3bdf97992b67651.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_830e3532f27b391585d5de90f3bdf97992b67651.hip deleted file mode 100644 index 6c320947ef89..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_830e3532f27b391585d5de90f3bdf97992b67651.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8352031044ef2e4a22e27ad04ab5d2c02121faee.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8352031044ef2e4a22e27ad04ab5d2c02121faee.hip deleted file mode 100644 index 88b13709d5ec..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8352031044ef2e4a22e27ad04ab5d2c02121faee.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); 
- constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_835a906031a258c6362313eec783678bd8125c91.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_835a906031a258c6362313eec783678bd8125c91.hip deleted file mode 100644 index e0a251e22ddd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_835a906031a258c6362313eec783678bd8125c91.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - 
if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_836a308c2d2afd6e0dfbfda61984b631c4ccffc6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_836a308c2d2afd6e0dfbfda61984b631c4ccffc6.hip deleted file mode 100644 index 2c4cdbac3707..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_836a308c2d2afd6e0dfbfda61984b631c4ccffc6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83d580a612af85533c87aecdd7b0345c71b75980.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83d580a612af85533c87aecdd7b0345c71b75980.hip deleted file mode 100644 index 306db4dc5008..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83d580a612af85533c87aecdd7b0345c71b75980.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83d920a76114c63156740ba5dd6f3846c4b21c28.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83d920a76114c63156740ba5dd6f3846c4b21c28.hip deleted file mode 100644 index 204b6dee7771..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83d920a76114c63156740ba5dd6f3846c4b21c28.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83ddca2c6ecbba4314c434e7471ffb8fa642f936.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83ddca2c6ecbba4314c434e7471ffb8fa642f936.hip deleted file mode 100644 index ac353df75b24..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83ddca2c6ecbba4314c434e7471ffb8fa642f936.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, 
- false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83f6a1837a65df12b7c55d25ca28cc939c2a6328.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83f6a1837a65df12b7c55d25ca28cc939c2a6328.hip deleted file mode 100644 index e0f7b733dadc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_83f6a1837a65df12b7c55d25ca28cc939c2a6328.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_843e7888cba5f463d19fcb71aaaab25dc3d2c09d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_843e7888cba5f463d19fcb71aaaab25dc3d2c09d.hip deleted file mode 100644 index 10012bf1b8dc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_843e7888cba5f463d19fcb71aaaab25dc3d2c09d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8441910c34830ad2459fb85c2c14af02da718fdc.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8441910c34830ad2459fb85c2c14af02da718fdc.hip deleted file mode 100644 index 09f65ddffee9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8441910c34830ad2459fb85c2c14af02da718fdc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8457ea5726149efb8778e6d90798b8e48288fc9a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8457ea5726149efb8778e6d90798b8e48288fc9a.hip deleted file mode 100644 index 1168a111bfc3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8457ea5726149efb8778e6d90798b8e48288fc9a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_847feaf237911478173377a501ee19ee325b012b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_847feaf237911478173377a501ee19ee325b012b.hip deleted file mode 100644 index fb8c2ff465ea..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_847feaf237911478173377a501ee19ee325b012b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_84cca7528c7d1bf49ba79625733ff0ae7522c096.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_84cca7528c7d1bf49ba79625733ff0ae7522c096.hip deleted file mode 100644 index 09957d445dcf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_84cca7528c7d1bf49ba79625733ff0ae7522c096.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_84dc4af43de08130a04bfa06df9799b6e9e96900.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_84dc4af43de08130a04bfa06df9799b6e9e96900.hip deleted file mode 100644 index 5bcfa28a7c21..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_84dc4af43de08130a04bfa06df9799b6e9e96900.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_84e8ae99e184013739019c93d07caddce532382b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_84e8ae99e184013739019c93d07caddce532382b.hip deleted file mode 100644 index 14b46ad67ae2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_84e8ae99e184013739019c93d07caddce532382b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_84fc5e94f89d6a9287cf64662a372784511468dd.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_84fc5e94f89d6a9287cf64662a372784511468dd.hip deleted file mode 100644 index a6bef398368a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_84fc5e94f89d6a9287cf64662a372784511468dd.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8513d96a66a4d9fb8dfc84afba7e1d8c200248a6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8513d96a66a4d9fb8dfc84afba7e1d8c200248a6.hip deleted file mode 100644 index d99e00fd35a8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8513d96a66a4d9fb8dfc84afba7e1d8c200248a6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - 
std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_85156f2c556c6ef6180608c361b7b35ede71ffea.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_85156f2c556c6ef6180608c361b7b35ede71ffea.hip deleted file mode 100644 index 785187122a1f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_85156f2c556c6ef6180608c361b7b35ede71ffea.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = 
k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_854c8003a508ed3f8cbe6967c4ae2635a491c721.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_854c8003a508ed3f8cbe6967c4ae2635a491c721.hip deleted file mode 100644 index b44495ac38bb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_854c8003a508ed3f8cbe6967c4ae2635a491c721.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_85908fe6dc9c629c82d6953081b10021e64583b1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_85908fe6dc9c629c82d6953081b10021e64583b1.hip deleted file mode 100644 index 7ecdf21a02d7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_85908fe6dc9c629c82d6953081b10021e64583b1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_85960fe542635079de5eca3c7785890cd4740005.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_85960fe542635079de5eca3c7785890cd4740005.hip deleted file mode 100644 index 849aa28fc080..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_85960fe542635079de5eca3c7785890cd4740005.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return 
ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_85fdde4b25e2fc8cbdd46c2850c19eac8d9af8f6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_85fdde4b25e2fc8cbdd46c2850c19eac8d9af8f6.hip deleted file mode 100644 index 00be09d0ffcc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_85fdde4b25e2fc8cbdd46c2850c19eac8d9af8f6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_86309c036d96367939ccc3e8922595ac35a3e179.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_86309c036d96367939ccc3e8922595ac35a3e179.hip deleted file mode 100644 index 8057a5c80445..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_86309c036d96367939ccc3e8922595ac35a3e179.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_86513d6e065a44bcb0c789eed1e7e5456e800ab6.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_86513d6e065a44bcb0c789eed1e7e5456e800ab6.hip deleted file mode 100644 index cb886078cd81..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_86513d6e065a44bcb0c789eed1e7e5456e800ab6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_865eb90b1a2d64acc0f6fbe1d807c501fd4be3cd.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_865eb90b1a2d64acc0f6fbe1d807c501fd4be3cd.hip deleted file mode 100644 index f970343ef954..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_865eb90b1a2d64acc0f6fbe1d807c501fd4be3cd.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8689126a7eb09d81baaf8f99dbff8932fbeab3cb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8689126a7eb09d81baaf8f99dbff8932fbeab3cb.hip deleted file mode 100644 index 8a835605072c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8689126a7eb09d81baaf8f99dbff8932fbeab3cb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_86d73393d0d8b769f30222f7817563a955c36dfc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_86d73393d0d8b769f30222f7817563a955c36dfc.hip deleted file mode 100644 index 1ae87b99d6fc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_86d73393d0d8b769f30222f7817563a955c36dfc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_86fa51b8c7a2f3fac5cf4cd2951ed2ede5c35450.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_86fa51b8c7a2f3fac5cf4cd2951ed2ede5c35450.hip deleted file mode 100644 index 5211cf9c9bea..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_86fa51b8c7a2f3fac5cf4cd2951ed2ede5c35450.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_875b08ca602fe48840c72cd61798acb98540fcd6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_875b08ca602fe48840c72cd61798acb98540fcd6.hip deleted file mode 100644 index f484bc9e71bf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_875b08ca602fe48840c72cd61798acb98540fcd6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_876a418fbe6183d0392b7a7d9986d067e323e2b9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_876a418fbe6183d0392b7a7d9986d067e323e2b9.hip deleted file mode 100644 index f33f73433e6f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_876a418fbe6183d0392b7a7d9986d067e323e2b9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_877e33463b3bf1853c6d2d2009af8d27bf88abbe.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_877e33463b3bf1853c6d2d2009af8d27bf88abbe.hip deleted file mode 100644 index 20166c479009..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_877e33463b3bf1853c6d2d2009af8d27bf88abbe.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8793dc3217e154b65ebba065aa10ab4dc2374ae8.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8793dc3217e154b65ebba065aa10ab4dc2374ae8.hip deleted file mode 100644 index 745fa19f66d9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8793dc3217e154b65ebba065aa10ab4dc2374ae8.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 256, - true, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<256, ck_tile::fp16_t, true, true, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_87e3a06266deda093bdf28af82d8666066157fc6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_87e3a06266deda093bdf28af82d8666066157fc6.hip deleted file mode 100644 index 43ea7e1a8a24..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_87e3a06266deda093bdf28af82d8666066157fc6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8840e8899b4e632714632450bcef001c6070f955.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8840e8899b4e632714632450bcef001c6070f955.hip deleted file mode 100644 index e83e3c95bb81..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8840e8899b4e632714632450bcef001c6070f955.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88ac7f6cbdfca2e397bcb86af4216e87166601c7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88ac7f6cbdfca2e397bcb86af4216e87166601c7.hip deleted file mode 100644 index fa4108d06f38..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88ac7f6cbdfca2e397bcb86af4216e87166601c7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88c04463f9c5ce565a9daa8c22e16de80fadd707.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88c04463f9c5ce565a9daa8c22e16de80fadd707.hip deleted file mode 100644 index 282c11cb4bec..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88c04463f9c5ce565a9daa8c22e16de80fadd707.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88d52c5f70abb525b9c8aa8fc1cb3997c33ed67c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88d52c5f70abb525b9c8aa8fc1cb3997c33ed67c.hip deleted file mode 100644 index ef926d879099..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88d52c5f70abb525b9c8aa8fc1cb3997c33ed67c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88ea5b5346c87cc4fc1e841c518080df4ab811a2.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88ea5b5346c87cc4fc1e841c518080df4ab811a2.hip deleted file mode 100644 index 7717baf29b11..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88ea5b5346c87cc4fc1e841c518080df4ab811a2.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - true, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::fp16_t, - true, - true, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88ed7f650c958a644c8031aeb88688b1e42458e5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88ed7f650c958a644c8031aeb88688b1e42458e5.hip deleted file mode 100644 index 171f415499bb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_88ed7f650c958a644c8031aeb88688b1e42458e5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_890aa875ac13957f00b30210477924697abf0c9e.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_890aa875ac13957f00b30210477924697abf0c9e.hip deleted file mode 100644 index d4342466be28..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_890aa875ac13957f00b30210477924697abf0c9e.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_89617bdea526d12d6a33ed42b9b0018c0b173722.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_89617bdea526d12d6a33ed42b9b0018c0b173722.hip deleted file mode 100644 index e1f24ddd6cdf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_89617bdea526d12d6a33ed42b9b0018c0b173722.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_89a3327da9a3411ff1cddc67eb647083cd947a92.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_89a3327da9a3411ff1cddc67eb647083cd947a92.hip deleted file mode 100644 index 7d4b97f05e36..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_89a3327da9a3411ff1cddc67eb647083cd947a92.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8a1fd28acfe85b3adac859c4bbffa4d28fe634fe.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8a1fd28acfe85b3adac859c4bbffa4d28fe634fe.hip deleted file mode 100644 index 8a57e89bea40..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8a1fd28acfe85b3adac859c4bbffa4d28fe634fe.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; 
- -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8a58d4bca33c4c0e79141a56688049237d170d1b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8a58d4bca33c4c0e79141a56688049237d170d1b.hip deleted file mode 100644 index 7f79025e61a5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8a58d4bca33c4c0e79141a56688049237d170d1b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8a824621a50cdc3cbadc4b1f9ef18e1325385082.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8a824621a50cdc3cbadc4b1f9ef18e1325385082.hip deleted file mode 100644 index 030234915a54..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8a824621a50cdc3cbadc4b1f9ef18e1325385082.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8a980749c6b2a18c80426dd189e5506334343ca4.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8a980749c6b2a18c80426dd189e5506334343ca4.hip deleted file mode 100644 index 40a903ca6124..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8a980749c6b2a18c80426dd189e5506334343ca4.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8adbdcd28cb2f078f89adf9aad2b3d4a0a477823.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8adbdcd28cb2f078f89adf9aad2b3d4a0a477823.hip deleted file mode 100644 index 4dde718429f9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8adbdcd28cb2f078f89adf9aad2b3d4a0a477823.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8b17c082f249649eca733a8f0cdf9a1205c3e3d7.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8b17c082f249649eca733a8f0cdf9a1205c3e3d7.hip deleted file mode 100644 index fde18547e502..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8b17c082f249649eca733a8f0cdf9a1205c3e3d7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8b9043572cabb65435627a3faf23b18d039bbcd8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8b9043572cabb65435627a3faf23b18d039bbcd8.hip deleted file mode 100644 index ea134058ed3c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8b9043572cabb65435627a3faf23b18d039bbcd8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - 
false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8b92990df507e82f96eeb7aa3ec00c01437566fb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8b92990df507e82f96eeb7aa3ec00c01437566fb.hip deleted file mode 100644 index 613b337ac3af..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8b92990df507e82f96eeb7aa3ec00c01437566fb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8bd1a40b12ce927323594fcce61eb9c20cc5e3d4.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8bd1a40b12ce927323594fcce61eb9c20cc5e3d4.hip deleted file mode 100644 index 45da6271b945..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8bd1a40b12ce927323594fcce61eb9c20cc5e3d4.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::bf16_t, - false, - false, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8bd7b8c63a51c8639b3cf27ad09d41ae47c480d3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8bd7b8c63a51c8639b3cf27ad09d41ae47c480d3.hip deleted file mode 100644 index e9e547b584b2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8bd7b8c63a51c8639b3cf27ad09d41ae47c480d3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c074afcf33e3f3534ac3577484237fcfd2ca48e.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c074afcf33e3f3534ac3577484237fcfd2ca48e.hip deleted file mode 100644 index 76c854d485a9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c074afcf33e3f3534ac3577484237fcfd2ca48e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c13c4f3f645a2bb475eb1c55ce1de452f0e2332.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c13c4f3f645a2bb475eb1c55ce1de452f0e2332.hip deleted file mode 100644 index 71115563f028..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c13c4f3f645a2bb475eb1c55ce1de452f0e2332.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::fp16_t, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c3bd4e029bba76ebfc79e6522dbc8ca0bba5dd2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c3bd4e029bba76ebfc79e6522dbc8ca0bba5dd2.hip deleted file mode 100644 index 0bc60f2df126..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c3bd4e029bba76ebfc79e6522dbc8ca0bba5dd2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c4688cbd23727dd0ea9a36fb977b31aeae98d65.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c4688cbd23727dd0ea9a36fb977b31aeae98d65.hip deleted file mode 100644 index 2d098e3171b0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c4688cbd23727dd0ea9a36fb977b31aeae98d65.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return 
ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c7970957024de050748d3e31cef434f582d968b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c7970957024de050748d3e31cef434f582d968b.hip deleted file mode 100644 index 40e2545786a4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8c7970957024de050748d3e31cef434f582d968b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - 
false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8cdcdeb845e7bcdb89ef70ab2a97157d4db3cb52.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8cdcdeb845e7bcdb89ef70ab2a97157d4db3cb52.hip deleted file mode 100644 index ae7548200010..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8cdcdeb845e7bcdb89ef70ab2a97157d4db3cb52.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8cf1007430da272174d3476d042f398627e83512.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8cf1007430da272174d3476d042f398627e83512.hip deleted file mode 100644 index 8e0f9e699cb1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8cf1007430da272174d3476d042f398627e83512.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8d079c1eb36db8461fa8b861c56760afcd97cc34.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8d079c1eb36db8461fa8b861c56760afcd97cc34.hip deleted file mode 100644 index dfa92cb08266..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8d079c1eb36db8461fa8b861c56760afcd97cc34.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8d7549e66ef309e32779ddc2a1f14e79bae53754.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8d7549e66ef309e32779ddc2a1f14e79bae53754.hip deleted file mode 100644 index 3232ed484959..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8d7549e66ef309e32779ddc2a1f14e79bae53754.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8d79fe8a600c3b4e0ec9aa510f8036ba2b608985.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8d79fe8a600c3b4e0ec9aa510f8036ba2b608985.hip deleted file mode 100644 index d2c743d56bb2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8d79fe8a600c3b4e0ec9aa510f8036ba2b608985.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - 
false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8da8285bd6182355e3164cdc5a983375cdf0a61d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8da8285bd6182355e3164cdc5a983375cdf0a61d.hip deleted file mode 100644 index a5566ff44ac4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8da8285bd6182355e3164cdc5a983375cdf0a61d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e1b48a28b71c7f4c78eb14321b39951a7c5e903.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e1b48a28b71c7f4c78eb14321b39951a7c5e903.hip deleted file mode 100644 index 4e55049aeccc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e1b48a28b71c7f4c78eb14321b39951a7c5e903.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e2c587db8bd9f1b551624e0cf8b67a90245d7da.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e2c587db8bd9f1b551624e0cf8b67a90245d7da.hip deleted file mode 100644 index 8e27eebc36c8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e2c587db8bd9f1b551624e0cf8b67a90245d7da.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::fp16_t, - false, - true, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e2d5f979fc4fbd0991581a020a414f9c8656ae2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e2d5f979fc4fbd0991581a020a414f9c8656ae2.hip deleted file mode 100644 index 06d937a7ddd5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e2d5f979fc4fbd0991581a020a414f9c8656ae2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e431313fe082958d31b68d2fd0d61df0fe56736.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e431313fe082958d31b68d2fd0d61df0fe56736.hip deleted file mode 100644 index 7941750c3330..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e431313fe082958d31b68d2fd0d61df0fe56736.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e50ea8dd480012cbe10be392cd26d1870e6ef9b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e50ea8dd480012cbe10be392cd26d1870e6ef9b.hip deleted file mode 100644 index a7bf9d7c2dc1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e50ea8dd480012cbe10be392cd26d1870e6ef9b.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e675919a6c7758cbbeecb83b7ac6c62f95cdb46.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e675919a6c7758cbbeecb83b7ac6c62f95cdb46.hip deleted file mode 100644 index 
9ebe18a6d355..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e675919a6c7758cbbeecb83b7ac6c62f95cdb46.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 128, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, false, true, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e812705ae3e452810794fa7caceef2ef6066dfb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e812705ae3e452810794fa7caceef2ef6066dfb.hip deleted file mode 100644 index 4a6e1dc46efd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e812705ae3e452810794fa7caceef2ef6066dfb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e816fcad5e9ecfca94a6491eb2274bcc41e558b.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e816fcad5e9ecfca94a6491eb2274bcc41e558b.hip deleted file mode 100644 index d78735b25f67..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e816fcad5e9ecfca94a6491eb2274bcc41e558b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e938d0e3ad30db201880642e57758285b2ec4cb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e938d0e3ad30db201880642e57758285b2ec4cb.hip deleted file mode 100644 index e3803929cc39..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8e938d0e3ad30db201880642e57758285b2ec4cb.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 64, - true, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<64, ck_tile::fp16_t, true, true, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8efb5fc2ace6839eac741c5e6616665845f43566.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8efb5fc2ace6839eac741c5e6616665845f43566.hip deleted file mode 100644 index 2dd0c6ae4b21..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8efb5fc2ace6839eac741c5e6616665845f43566.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8f607ee20c0d92b6dbd0338f139517fdcce98d0c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8f607ee20c0d92b6dbd0338f139517fdcce98d0c.hip deleted file mode 100644 index 8993d8848553..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8f607ee20c0d92b6dbd0338f139517fdcce98d0c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; 
- return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8f6e463eedd3e65b9c79feed3cd92ad8cbc9f036.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8f6e463eedd3e65b9c79feed3cd92ad8cbc9f036.hip deleted file mode 100644 index 574533a38ffe..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8f6e463eedd3e65b9c79feed3cd92ad8cbc9f036.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - 
ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8f7166d4bb0c1c9b9999ba16a1adbf09ebfdb6f1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8f7166d4bb0c1c9b9999ba16a1adbf09ebfdb6f1.hip deleted file mode 100644 index a95ada1b8090..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8f7166d4bb0c1c9b9999ba16a1adbf09ebfdb6f1.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8fa4c40e244b412a07933d369704bcdaa6d5e74c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8fa4c40e244b412a07933d369704bcdaa6d5e74c.hip deleted file mode 100644 index e739f83a24ef..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8fa4c40e244b412a07933d369704bcdaa6d5e74c.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8fb224b40a7be7db0a9c5c08cc5ab05b526c14e8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8fb224b40a7be7db0a9c5c08cc5ab05b526c14e8.hip deleted file mode 100644 index 7553a56903c4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8fb224b40a7be7db0a9c5c08cc5ab05b526c14e8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8fb33fc20f2e85e915f1b1529ae87981dfcaf86d.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8fb33fc20f2e85e915f1b1529ae87981dfcaf86d.hip deleted file mode 100644 index 2feaeb0eb37e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8fb33fc20f2e85e915f1b1529ae87981dfcaf86d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8fc08b4f3959a2375ac03f40c4ce12d70cdc2d80.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8fc08b4f3959a2375ac03f40c4ce12d70cdc2d80.hip deleted file mode 100644 index bc051fb5d67a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_8fc08b4f3959a2375ac03f40c4ce12d70cdc2d80.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - 
-template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9009b7d39346537aa6c4a4e46b81139f603edb60.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9009b7d39346537aa6c4a4e46b81139f603edb60.hip deleted file mode 100644 index 0a983cd9f076..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9009b7d39346537aa6c4a4e46b81139f603edb60.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - 
typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_900d7f81c73b35ea64095d01c5d48d9190839e0a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_900d7f81c73b35ea64095d01c5d48d9190839e0a.hip deleted file mode 100644 index 7a43253061cf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_900d7f81c73b35ea64095d01c5d48d9190839e0a.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9068ba8df8b0e977e9769f6acf6cfee6b00b9922.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9068ba8df8b0e977e9769f6acf6cfee6b00b9922.hip deleted file mode 100644 index 943b25bae028..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9068ba8df8b0e977e9769f6acf6cfee6b00b9922.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_906fa8bf5e992ddc25815486ae9c24d8bfba7227.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_906fa8bf5e992ddc25815486ae9c24d8bfba7227.hip deleted file mode 100644 index 259d2ed9379b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_906fa8bf5e992ddc25815486ae9c24d8bfba7227.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_90b17d8cba28cceddb3ef907df878aeef0762d15.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_90b17d8cba28cceddb3ef907df878aeef0762d15.hip deleted file mode 100644 index 2b3c375441f4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_90b17d8cba28cceddb3ef907df878aeef0762d15.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - 
false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_90da0d469cca5c8481504148468460c85a15c559.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_90da0d469cca5c8481504148468460c85a15c559.hip deleted file mode 100644 index 349f9825962f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_90da0d469cca5c8481504148468460c85a15c559.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_90e5c56e92712d00092ba102a5eb5176a3e5d471.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_90e5c56e92712d00092ba102a5eb5176a3e5d471.hip deleted file mode 100644 index 45579a587006..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_90e5c56e92712d00092ba102a5eb5176a3e5d471.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_910cb8bd09d287a1566265eb1e8894fe68d3cc81.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_910cb8bd09d287a1566265eb1e8894fe68d3cc81.hip deleted file mode 100644 index 934419fd20a7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_910cb8bd09d287a1566265eb1e8894fe68d3cc81.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, 
blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_915b75db795dbef037b14b003ee073665fe35d3e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_915b75db795dbef037b14b003ee073665fe35d3e.hip deleted file mode 100644 index 472dcad3aba7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_915b75db795dbef037b14b003ee073665fe35d3e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - 
fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9163ae070075f26926a86d39e15c27e6edb1f1cf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9163ae070075f26926a86d39e15c27e6edb1f1cf.hip deleted file mode 100644 index 7ba85fd26fc5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9163ae070075f26926a86d39e15c27e6edb1f1cf.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_91695dea4171747fb3cc6d910459f800608d07c1.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_91695dea4171747fb3cc6d910459f800608d07c1.hip deleted file mode 100644 index 6f1a50eaf856..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_91695dea4171747fb3cc6d910459f800608d07c1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_919ae177b7a793fa352c4f6bb8e4175f3064d814.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_919ae177b7a793fa352c4f6bb8e4175f3064d814.hip deleted file mode 100644 index 319a938f0092..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_919ae177b7a793fa352c4f6bb8e4175f3064d814.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_91a6200e36944b1f11106c02f7fcee053f01ee71.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_91a6200e36944b1f11106c02f7fcee053f01ee71.hip deleted file mode 100644 index b9d3651bc47e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_91a6200e36944b1f11106c02f7fcee053f01ee71.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_91b9e2616c2fe0480096b1ccf0f74d584b220146.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_91b9e2616c2fe0480096b1ccf0f74d584b220146.hip deleted file mode 100644 index 64d6f0a659bb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_91b9e2616c2fe0480096b1ccf0f74d584b220146.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::fp16_t, - false, - true, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_91c916e14198f6d18dc89915e379b01070434e91.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_91c916e14198f6d18dc89915e379b01070434e91.hip deleted file mode 100644 index 7458f3ff3d25..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_91c916e14198f6d18dc89915e379b01070434e91.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9207a63fc55c411c73e4f93306c5ffed800dd249.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9207a63fc55c411c73e4f93306c5ffed800dd249.hip deleted file mode 100644 index 30096f88e790..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9207a63fc55c411c73e4f93306c5ffed800dd249.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92121fd448b4640a17e1a7fe73bb7b58714c0afb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92121fd448b4640a17e1a7fe73bb7b58714c0afb.hip deleted file mode 100644 index fe0620251d5f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92121fd448b4640a17e1a7fe73bb7b58714c0afb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_921f789d619db6f225e8e9d646e93bbc9dc1a669.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_921f789d619db6f225e8e9d646e93bbc9dc1a669.hip deleted file mode 100644 index 60b820477d09..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_921f789d619db6f225e8e9d646e93bbc9dc1a669.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92739f4464512feee083b875e11e11eee4f5b448.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92739f4464512feee083b875e11e11eee4f5b448.hip deleted file mode 100644 index 7f77ad653acc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92739f4464512feee083b875e11e11eee4f5b448.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92992be6252f2afdc368bd4baec4b8a55ae0abf8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92992be6252f2afdc368bd4baec4b8a55ae0abf8.hip deleted file mode 100644 index 32e51b40be0b..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92992be6252f2afdc368bd4baec4b8a55ae0abf8.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92b0770fe64e3c60b9e56170aa88bbf74802a813.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92b0770fe64e3c60b9e56170aa88bbf74802a813.hip deleted file mode 100644 index 6acc4ea1078f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92b0770fe64e3c60b9e56170aa88bbf74802a813.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92b722cdabcfaa388ccc6ccceb7e42462f3bdcd1.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92b722cdabcfaa388ccc6ccceb7e42462f3bdcd1.hip deleted file mode 100644 index b182fd41a86e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92b722cdabcfaa388ccc6ccceb7e42462f3bdcd1.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92ba64cdf615c1be2865f027a293cb530fc07dc6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92ba64cdf615c1be2865f027a293cb530fc07dc6.hip deleted file mode 100644 index b2d0f026a38f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92ba64cdf615c1be2865f027a293cb530fc07dc6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92d841e6d783bb46d841aafd9027f92dd1b61b88.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92d841e6d783bb46d841aafd9027f92dd1b61b88.hip deleted file mode 100644 index f90fd93ffce9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92d841e6d783bb46d841aafd9027f92dd1b61b88.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92e53359c69bbe4d7405d45261a8a62008eb7d06.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92e53359c69bbe4d7405d45261a8a62008eb7d06.hip deleted file mode 100644 index 1f76859fac80..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92e53359c69bbe4d7405d45261a8a62008eb7d06.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92f9ad0fb65638cfffb3e7786f2cbf01d9585b23.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92f9ad0fb65638cfffb3e7786f2cbf01d9585b23.hip deleted file mode 100644 index 343d71f5d5f7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_92f9ad0fb65638cfffb3e7786f2cbf01d9585b23.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_93054acb8a9508fd0f0f486367fb62454de47c39.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_93054acb8a9508fd0f0f486367fb62454de47c39.hip deleted file mode 100644 index cbbd56c6c335..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_93054acb8a9508fd0f0f486367fb62454de47c39.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_931cf8d05cfa45319f4e5bb49334d35a530bffcf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_931cf8d05cfa45319f4e5bb49334d35a530bffcf.hip deleted file mode 100644 index c5626028e95e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_931cf8d05cfa45319f4e5bb49334d35a530bffcf.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_93728d999ae43ee1b5a16e60b90cf8533c7d303f.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_93728d999ae43ee1b5a16e60b90cf8533c7d303f.hip deleted file mode 100644 index 1ce05165d664..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_93728d999ae43ee1b5a16e60b90cf8533c7d303f.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_937801fbb43fb6797f0425f08d13926b74d87c4a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_937801fbb43fb6797f0425f08d13926b74d87c4a.hip deleted file mode 100644 index fb710a2ba713..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_937801fbb43fb6797f0425f08d13926b74d87c4a.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 32, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_937c48d0b7096ad6c8bc445f13f2c8c1934695ab.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_937c48d0b7096ad6c8bc445f13f2c8c1934695ab.hip deleted file mode 100644 index 3456274dfce7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_937c48d0b7096ad6c8bc445f13f2c8c1934695ab.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_93b885d6869400b0dc2ef1b2c2636ddfd21cde31.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_93b885d6869400b0dc2ef1b2c2636ddfd21cde31.hip deleted file mode 100644 index a93a65373337..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_93b885d6869400b0dc2ef1b2c2636ddfd21cde31.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 32, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, false, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_942439e4f5644a3a4630481bc7d98834b29b6e1c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_942439e4f5644a3a4630481bc7d98834b29b6e1c.hip deleted file mode 100644 index f95269dbae6a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_942439e4f5644a3a4630481bc7d98834b29b6e1c.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_94a94d145e575747c8956ac703810582c819e2e8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_94a94d145e575747c8956ac703810582c819e2e8.hip deleted file mode 100644 index f095f167e5a1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_94a94d145e575747c8956ac703810582c819e2e8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_94aa519eb57e5797125728492d9330f5c0f0670a.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_94aa519eb57e5797125728492d9330f5c0f0670a.hip deleted file mode 100644 index d5b99a089e94..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_94aa519eb57e5797125728492d9330f5c0f0670a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_94f6f9dee9f0c3825d91f4d320a5280070e60ee7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_94f6f9dee9f0c3825d91f4d320a5280070e60ee7.hip deleted file mode 100644 index 33dbc942249a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_94f6f9dee9f0c3825d91f4d320a5280070e60ee7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_95061acc6650fc7b79fa1fe5b2b1e083555eec2c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_95061acc6650fc7b79fa1fe5b2b1e083555eec2c.hip deleted file mode 100644 index a42f11ace223..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_95061acc6650fc7b79fa1fe5b2b1e083555eec2c.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_951343832a5bfd060c8d12da0d8a090f070a717d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_951343832a5bfd060c8d12da0d8a090f070a717d.hip deleted file mode 100644 index 3805805858e2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_951343832a5bfd060c8d12da0d8a090f070a717d.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9545f95c1093c60f0fb6c794636f79aaeb53b733.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9545f95c1093c60f0fb6c794636f79aaeb53b733.hip deleted file mode 100644 index 9ea99eae30f0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9545f95c1093c60f0fb6c794636f79aaeb53b733.hip +++ /dev/null @@ -1,73 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - true, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::bf16_t, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_95530399ad7b43d8ce2c89da24c71056f2146b18.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_95530399ad7b43d8ce2c89da24c71056f2146b18.hip deleted file mode 100644 index 4d406596d68c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_95530399ad7b43d8ce2c89da24c71056f2146b18.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9583148fd684a7e6a312127e023798278415bd27.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9583148fd684a7e6a312127e023798278415bd27.hip deleted file mode 100644 index 490b196957e6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9583148fd684a7e6a312127e023798278415bd27.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9594816877815bc0294610ca24f986fdccdc7c6f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9594816877815bc0294610ca24f986fdccdc7c6f.hip deleted file mode 100644 index 0253bcf4a3c3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9594816877815bc0294610ca24f986fdccdc7c6f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_960ecb3013071fb65f2d5ed4c947c4bf303e5308.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_960ecb3013071fb65f2d5ed4c947c4bf303e5308.hip deleted file mode 100644 index bd228e133296..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_960ecb3013071fb65f2d5ed4c947c4bf303e5308.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9638c9618dbf2af119e37596f7eb0fd3f8d72748.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9638c9618dbf2af119e37596f7eb0fd3f8d72748.hip deleted file mode 100644 index 0b0de06043d6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9638c9618dbf2af119e37596f7eb0fd3f8d72748.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_963986150adcd6e1d3886bacf2166de1252e14df.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_963986150adcd6e1d3886bacf2166de1252e14df.hip deleted file mode 100644 index 24e465ac22cf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_963986150adcd6e1d3886bacf2166de1252e14df.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::bf16_t, - false, - false, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_964f916d3484295b5918e2e4c22c5529588a5662.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_964f916d3484295b5918e2e4c22c5529588a5662.hip deleted file mode 100644 index c78ed25690b7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_964f916d3484295b5918e2e4c22c5529588a5662.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9689ecd7bf51bcffe9f5002959bdda41c50a3c8b.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9689ecd7bf51bcffe9f5002959bdda41c50a3c8b.hip deleted file mode 100644 index 24a1c2d5bc17..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9689ecd7bf51bcffe9f5002959bdda41c50a3c8b.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::fp16_t, - false, - false, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_968fc75a7d102aca068e3ceb6111728c280fa837.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_968fc75a7d102aca068e3ceb6111728c280fa837.hip deleted file mode 100644 index f85d1366ac32..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_968fc75a7d102aca068e3ceb6111728c280fa837.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96c129dd4c798343d6f78ab78056f0faf2f1c9d3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96c129dd4c798343d6f78ab78056f0faf2f1c9d3.hip deleted file mode 100644 index 83b5d416c246..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96c129dd4c798343d6f78ab78056f0faf2f1c9d3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, 
ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96c5e79f54b71677124f555b0ae4bfd27248d099.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96c5e79f54b71677124f555b0ae4bfd27248d099.hip deleted file mode 100644 index 4d3fe46ed60e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96c5e79f54b71677124f555b0ae4bfd27248d099.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96caa2056d99eb67ada498e287b4fae984397691.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96caa2056d99eb67ada498e287b4fae984397691.hip deleted file mode 100644 index d014ecfc46f4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96caa2056d99eb67ada498e287b4fae984397691.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96dee49ec6755006d67f0c30c65f50558bba69b0.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96dee49ec6755006d67f0c30c65f50558bba69b0.hip deleted file mode 100644 index 684f8b4637cf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96dee49ec6755006d67f0c30c65f50558bba69b0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96f1bb85dff8c97846f6b2e8796a6289bcd0d9d3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96f1bb85dff8c97846f6b2e8796a6289bcd0d9d3.hip deleted file mode 100644 index 223e3e97cac1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_96f1bb85dff8c97846f6b2e8796a6289bcd0d9d3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - 
false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_970073c70133ff2ee4737f803a0ac43801c47242.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_970073c70133ff2ee4737f803a0ac43801c47242.hip deleted file mode 100644 index 2b525655af45..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_970073c70133ff2ee4737f803a0ac43801c47242.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_971a08c2e48d805b295d979b24173a04cf58def0.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_971a08c2e48d805b295d979b24173a04cf58def0.hip deleted file mode 100644 index b8d636466229..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_971a08c2e48d805b295d979b24173a04cf58def0.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_97246460c21bc66c0f13936d27477a9fca1c44d1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_97246460c21bc66c0f13936d27477a9fca1c44d1.hip deleted file mode 100644 index e9da11dde247..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_97246460c21bc66c0f13936d27477a9fca1c44d1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9745b04a8026a01828c5dd606d89d044d3ed1d99.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9745b04a8026a01828c5dd606d89d044d3ed1d99.hip deleted file mode 100644 index 2b7c4276f99c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9745b04a8026a01828c5dd606d89d044d3ed1d99.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - true, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::fp16_t, - true, - true, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_976cf509d9c2bf86ba6ee5ded544fa8e6717f590.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_976cf509d9c2bf86ba6ee5ded544fa8e6717f590.hip deleted file mode 100644 index e56905bd29a9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_976cf509d9c2bf86ba6ee5ded544fa8e6717f590.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_977137b371df841993c8d0584be7d83aca6add78.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_977137b371df841993c8d0584be7d83aca6add78.hip deleted file mode 100644 index b8418a6d5351..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_977137b371df841993c8d0584be7d83aca6add78.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_97851d5ecbf02f8af623988b1a39c0b91e51533a.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_97851d5ecbf02f8af623988b1a39c0b91e51533a.hip deleted file mode 100644 index 044cd6b8ed27..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_97851d5ecbf02f8af623988b1a39c0b91e51533a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9801b25e0f132d647934deb395b62a3f70cc7c88.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9801b25e0f132d647934deb395b62a3f70cc7c88.hip deleted file mode 100644 index e3b12b627879..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9801b25e0f132d647934deb395b62a3f70cc7c88.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_987a617fae00fa90a1ba60937b0312c81087c19e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_987a617fae00fa90a1ba60937b0312c81087c19e.hip deleted file mode 100644 index 4cd907c27014..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_987a617fae00fa90a1ba60937b0312c81087c19e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_987f00dd759d9714693e7517dfaa8bb427294d42.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_987f00dd759d9714693e7517dfaa8bb427294d42.hip deleted file mode 100644 index 05e2906409fa..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_987f00dd759d9714693e7517dfaa8bb427294d42.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9893336a4b00b2a63f23ed7e13ec54c82d9e5063.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9893336a4b00b2a63f23ed7e13ec54c82d9e5063.hip deleted file mode 100644 index 2c14aa3dfe80..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9893336a4b00b2a63f23ed7e13ec54c82d9e5063.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - true, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::bf16_t, - true, - true, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_98e484adeddf3394d8d7693b808d83b64c71ee69.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_98e484adeddf3394d8d7693b808d83b64c71ee69.hip deleted file mode 100644 index 73f1520faa2b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_98e484adeddf3394d8d7693b808d83b64c71ee69.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_98f5efcd500ce6b9ffc14bc9877e0ba457539925.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_98f5efcd500ce6b9ffc14bc9877e0ba457539925.hip deleted file mode 100644 index ea866c827c29..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_98f5efcd500ce6b9ffc14bc9877e0ba457539925.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_98f9a4f4d85f292b78123599a2e1798f12aa545b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_98f9a4f4d85f292b78123599a2e1798f12aa545b.hip deleted file mode 100644 index da636b8e344f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_98f9a4f4d85f292b78123599a2e1798f12aa545b.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9990e6ad243a48b84304b5cad0c663c0802aedfd.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9990e6ad243a48b84304b5cad0c663c0802aedfd.hip deleted file mode 100644 index 3be0fb6df1ac..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9990e6ad243a48b84304b5cad0c663c0802aedfd.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_99ae680eed89ea93a3a94586bd5a68dbc5439f37.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_99ae680eed89ea93a3a94586bd5a68dbc5439f37.hip deleted file mode 100644 index 420137406e6d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_99ae680eed89ea93a3a94586bd5a68dbc5439f37.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_99e2f290b962f1617b0a9d4fd6d55c43e4439d6f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_99e2f290b962f1617b0a9d4fd6d55c43e4439d6f.hip deleted file mode 100644 index 145cf83ea4bd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_99e2f290b962f1617b0a9d4fd6d55c43e4439d6f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_99f8352674bd6bbe98944a1c0a769a4fc028a623.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_99f8352674bd6bbe98944a1c0a769a4fc028a623.hip deleted file mode 100644 index ee4361de8718..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_99f8352674bd6bbe98944a1c0a769a4fc028a623.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9a0a70932bd587759df1e5e150b25b0126d7b529.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9a0a70932bd587759df1e5e150b25b0126d7b529.hip deleted file mode 100644 index af0ffa213778..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9a0a70932bd587759df1e5e150b25b0126d7b529.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9a20fa19d8d30654602e363806f559113218d66d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9a20fa19d8d30654602e363806f559113218d66d.hip deleted file mode 100644 index 
c9a76092cd76..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9a20fa19d8d30654602e363806f559113218d66d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 
blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9a8e04fe9432a60f86ff0369e8c1851821074a04.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9a8e04fe9432a60f86ff0369e8c1851821074a04.hip deleted file mode 100644 index d596d50779bb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9a8e04fe9432a60f86ff0369e8c1851821074a04.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9a9edbe35a8fac7796f00bde836bd547044770ea.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9a9edbe35a8fac7796f00bde836bd547044770ea.hip deleted file mode 100644 index e5c9c1d2d93d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9a9edbe35a8fac7796f00bde836bd547044770ea.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = 
ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9ab73ea77ec20ea3bfaf995dacf93a6960ecdca0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9ab73ea77ec20ea3bfaf995dacf93a6960ecdca0.hip deleted file mode 100644 index e973f986deb3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9ab73ea77ec20ea3bfaf995dacf93a6960ecdca0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9ad1f99284aafc8d7908d062f179a056eb314925.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9ad1f99284aafc8d7908d062f179a056eb314925.hip deleted file mode 100644 index 95bf476babcb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9ad1f99284aafc8d7908d062f179a056eb314925.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9ae866c7db36286876818bfb718ac35204fa3843.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9ae866c7db36286876818bfb718ac35204fa3843.hip deleted file mode 100644 index a15c18862336..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9ae866c7db36286876818bfb718ac35204fa3843.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - 
false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9afe4b6f3b901ff4af81bd4f1cd8ff19f09d0b07.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9afe4b6f3b901ff4af81bd4f1cd8ff19f09d0b07.hip deleted file mode 100644 index d0e5277b6f85..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9afe4b6f3b901ff4af81bd4f1cd8ff19f09d0b07.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b062dd633645772e4f2caffd111af73184f7657.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b062dd633645772e4f2caffd111af73184f7657.hip deleted file mode 100644 index dc6a112cc5f2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b062dd633645772e4f2caffd111af73184f7657.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b327f0fa1155f2235d76be45cd22e3db5a69429.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b327f0fa1155f2235d76be45cd22e3db5a69429.hip deleted file mode 100644 index 064c9fcbd8fa..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b327f0fa1155f2235d76be45cd22e3db5a69429.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b4dcde1ae3446b825dea739d4295c1d1ec5c4be.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b4dcde1ae3446b825dea739d4295c1d1ec5c4be.hip deleted file mode 100644 index d566a2583a3c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b4dcde1ae3446b825dea739d4295c1d1ec5c4be.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; 
- -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b6d08e63b9a90f2524cbfa8c5fcf8b82a1d2d36.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b6d08e63b9a90f2524cbfa8c5fcf8b82a1d2d36.hip deleted file mode 100644 index 50f7dfc8afb3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b6d08e63b9a90f2524cbfa8c5fcf8b82a1d2d36.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b73c92a13757877f34bd8a13c6fb29b60999020.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b73c92a13757877f34bd8a13c6fb29b60999020.hip deleted file mode 100644 index 1ccea0d2aec3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b73c92a13757877f34bd8a13c6fb29b60999020.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - 
if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b841b7cf5da31f0c30ec42c91cc8d5bd3fedd03.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b841b7cf5da31f0c30ec42c91cc8d5bd3fedd03.hip deleted file mode 100644 index 3eb3770ae423..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9b841b7cf5da31f0c30ec42c91cc8d5bd3fedd03.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9bcc791049e3ff9ebc1a9085d2d20efcc2f99b71.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9bcc791049e3ff9ebc1a9085d2d20efcc2f99b71.hip deleted file mode 100644 index aec0230d741b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9bcc791049e3ff9ebc1a9085d2d20efcc2f99b71.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9bf235679af1ca03a6e601b4cf6cd0416d1c9091.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9bf235679af1ca03a6e601b4cf6cd0416d1c9091.hip deleted file mode 100644 index 82c5b3f6119d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9bf235679af1ca03a6e601b4cf6cd0416d1c9091.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9c4fc7cda4b560040cec93f63021b529aa1ee3fd.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9c4fc7cda4b560040cec93f63021b529aa1ee3fd.hip deleted file mode 100644 index 9cd1460c12a5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9c4fc7cda4b560040cec93f63021b529aa1ee3fd.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; 
- -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9ca3b1d36d777213eb381b47871bf15dd163c994.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9ca3b1d36d777213eb381b47871bf15dd163c994.hip deleted file mode 100644 index c99e8d61b177..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9ca3b1d36d777213eb381b47871bf15dd163c994.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9cc3ef3d3b36f52089548e9dce522b0448e2c26a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9cc3ef3d3b36f52089548e9dce522b0448e2c26a.hip deleted file mode 100644 index ee1287b43f36..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9cc3ef3d3b36f52089548e9dce522b0448e2c26a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9d3d274058bc0a3d4d35d90669587761fdfbdba1.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9d3d274058bc0a3d4d35d90669587761fdfbdba1.hip deleted file mode 100644 index 1f64d9734823..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9d3d274058bc0a3d4d35d90669587761fdfbdba1.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9d6759d8855c4c6289f1f241a1628cf0406c1b64.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9d6759d8855c4c6289f1f241a1628cf0406c1b64.hip deleted file mode 100644 index 588cc32456d7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9d6759d8855c4c6289f1f241a1628cf0406c1b64.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9d69d441f48f9ea346dd8e00376a9a708da3ad87.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9d69d441f48f9ea346dd8e00376a9a708da3ad87.hip deleted file mode 100644 index 11a8723145a9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9d69d441f48f9ea346dd8e00376a9a708da3ad87.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9dc424f0e192155e3c4e786e5b87d5a1a3e6c4ad.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9dc424f0e192155e3c4e786e5b87d5a1a3e6c4ad.hip deleted file mode 100644 index 2cbd253a36a8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9dc424f0e192155e3c4e786e5b87d5a1a3e6c4ad.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9e51083e13aa4dfa8c969f8f916835a8e5e9ca39.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9e51083e13aa4dfa8c969f8f916835a8e5e9ca39.hip deleted file mode 100644 index 3cfc1ba422ed..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9e51083e13aa4dfa8c969f8f916835a8e5e9ca39.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9eef1b54d5d3841f3fa6b84cca6c7ad33efa2d9f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9eef1b54d5d3841f3fa6b84cca6c7ad33efa2d9f.hip deleted file mode 100644 index 3b4f0f7ad35e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9eef1b54d5d3841f3fa6b84cca6c7ad33efa2d9f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9f0517550c7a23882b95de451e8099ea2186b4ce.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9f0517550c7a23882b95de451e8099ea2186b4ce.hip deleted file mode 100644 index 4d63d6f0b935..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9f0517550c7a23882b95de451e8099ea2186b4ce.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9fb389d4b5ba590baa951f17da06f0e53d2bfa55.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9fb389d4b5ba590baa951f17da06f0e53d2bfa55.hip deleted file mode 100644 index 3f741d34a0fb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_9fb389d4b5ba590baa951f17da06f0e53d2bfa55.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a017be7b8bcf303b30a147f41346898acc5fab7d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a017be7b8bcf303b30a147f41346898acc5fab7d.hip deleted file mode 100644 index ffd897040dd7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a017be7b8bcf303b30a147f41346898acc5fab7d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a02a71fdd587e47ee68e0cc76c3c4494ce06c359.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a02a71fdd587e47ee68e0cc76c3c4494ce06c359.hip deleted file mode 100644 index 7ca210ee7e1f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a02a71fdd587e47ee68e0cc76c3c4494ce06c359.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a02f152e9184af0b3d77082d8bdf519dbbfceb2d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a02f152e9184af0b3d77082d8bdf519dbbfceb2d.hip deleted file mode 100644 index e4aa5e9c9a9d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a02f152e9184af0b3d77082d8bdf519dbbfceb2d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a046e888e3836b0bd3c49fec8e1872e880798f0c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a046e888e3836b0bd3c49fec8e1872e880798f0c.hip deleted file mode 100644 index 371c3c466073..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a046e888e3836b0bd3c49fec8e1872e880798f0c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a0874fc5ac87a1ec487c7722bf3b1bdaa924ee09.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a0874fc5ac87a1ec487c7722bf3b1bdaa924ee09.hip deleted file mode 100644 index 4524055af0e3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a0874fc5ac87a1ec487c7722bf3b1bdaa924ee09.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a094599fb5caf5e7aba728cd4713a8d0c6368a46.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a094599fb5caf5e7aba728cd4713a8d0c6368a46.hip deleted file mode 100644 index 5802a84a0e0b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a094599fb5caf5e7aba728cd4713a8d0c6368a46.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a0a556c9358ddd6db719458c81d2d6d822a895da.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a0a556c9358ddd6db719458c81d2d6d822a895da.hip deleted file mode 100644 index 68adac38978a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a0a556c9358ddd6db719458c81d2d6d822a895da.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a103cd47156a98ad2cf2c325ea00df3f1d67fb72.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a103cd47156a98ad2cf2c325ea00df3f1d67fb72.hip deleted file mode 100644 index 673ecb9ada48..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a103cd47156a98ad2cf2c325ea00df3f1d67fb72.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - 
constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a189292c81a18d21a2921ce6740f81ebf4c046ad.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a189292c81a18d21a2921ce6740f81ebf4c046ad.hip deleted file mode 100644 index 7214c13980e8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a189292c81a18d21a2921ce6740f81ebf4c046ad.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - 
std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a1c71e7d33f0597fe090a3524e33e18b2e562680.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a1c71e7d33f0597fe090a3524e33e18b2e562680.hip deleted file mode 100644 index 110d3ba13fea..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a1c71e7d33f0597fe090a3524e33e18b2e562680.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a1cba1509c413c870c5d784410855ee1bd737da2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a1cba1509c413c870c5d784410855ee1bd737da2.hip deleted file mode 100644 index 0955208d8ab0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a1cba1509c413c870c5d784410855ee1bd737da2.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a1d6ad9de7ac7993ae1923a2ef070b7dacb8c563.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a1d6ad9de7ac7993ae1923a2ef070b7dacb8c563.hip deleted file mode 100644 index af8d021fa7be..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a1d6ad9de7ac7993ae1923a2ef070b7dacb8c563.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a20c91b2f11bb7e5058ca7935b0bda4f5558a9dc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a20c91b2f11bb7e5058ca7935b0bda4f5558a9dc.hip deleted file mode 100644 index 3f80acb260ea..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a20c91b2f11bb7e5058ca7935b0bda4f5558a9dc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, 
ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a21f3637624762547af1292e1b85e640b1d329dc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a21f3637624762547af1292e1b85e640b1d329dc.hip deleted file mode 100644 index d26979e2d5d3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a21f3637624762547af1292e1b85e640b1d329dc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a225c4f1f3c7b271957768bb9235131c67afb48a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a225c4f1f3c7b271957768bb9235131c67afb48a.hip deleted file mode 100644 index 061afd23dc9f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a225c4f1f3c7b271957768bb9235131c67afb48a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a2482a64659c838f3da55f56e3cbbee1dbfe6722.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a2482a64659c838f3da55f56e3cbbee1dbfe6722.hip deleted file mode 100644 index 3321e8a41a54..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a2482a64659c838f3da55f56e3cbbee1dbfe6722.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a25e2aed617e1ff31f93ae7e054313ee0dceee97.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a25e2aed617e1ff31f93ae7e054313ee0dceee97.hip deleted file mode 100644 index 592870480fd8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a25e2aed617e1ff31f93ae7e054313ee0dceee97.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a2a715b7e9c1a576f011dfe5769c5b392e984f82.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a2a715b7e9c1a576f011dfe5769c5b392e984f82.hip deleted file mode 100644 index bb0514413888..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a2a715b7e9c1a576f011dfe5769c5b392e984f82.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a2ef5d30a2318ae06430d17f84878800c4ca7364.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a2ef5d30a2318ae06430d17f84878800c4ca7364.hip deleted file mode 100644 index 566198c7f81e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a2ef5d30a2318ae06430d17f84878800c4ca7364.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3339150d8bf9d073827738527f6cbe15b854607.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3339150d8bf9d073827738527f6cbe15b854607.hip deleted file mode 100644 index 21c141e4088b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3339150d8bf9d073827738527f6cbe15b854607.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3709e4fc53d2254a03ea7660b8c72d2f47cf1ad.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3709e4fc53d2254a03ea7660b8c72d2f47cf1ad.hip deleted file mode 100644 index e46b8bbff686..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3709e4fc53d2254a03ea7660b8c72d2f47cf1ad.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a388a284f45f711d82a6ed87036d87cef1872eb1.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a388a284f45f711d82a6ed87036d87cef1872eb1.hip deleted file mode 100644 index 7c7b7b8746cf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a388a284f45f711d82a6ed87036d87cef1872eb1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ 
- using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3ac4f93722dc314086f1b7d7b8adc687cd75f82.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3ac4f93722dc314086f1b7d7b8adc687cd75f82.hip deleted file mode 100644 index 1d0892ee5bd9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3ac4f93722dc314086f1b7d7b8adc687cd75f82.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::fp16_t, - false, - true, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3d7aa46528ee74e2bef1e87c1feceacfa55e173.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3d7aa46528ee74e2bef1e87c1feceacfa55e173.hip deleted file mode 100644 index a1a71baf2510..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3d7aa46528ee74e2bef1e87c1feceacfa55e173.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3dc780b17152f696f9b957432c2eae8fb16e85e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3dc780b17152f696f9b957432c2eae8fb16e85e.hip deleted file mode 100644 index 780d9b66c834..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3dc780b17152f696f9b957432c2eae8fb16e85e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return 
ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3f9c236d24b30bc9c3fad90cfd6eb00da835de2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3f9c236d24b30bc9c3fad90cfd6eb00da835de2.hip deleted file mode 100644 index 7bb86590805b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3f9c236d24b30bc9c3fad90cfd6eb00da835de2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - 
false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3ff8445ba691807caadd9f26e7eb90851875280.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3ff8445ba691807caadd9f26e7eb90851875280.hip deleted file mode 100644 index 8964c1287266..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a3ff8445ba691807caadd9f26e7eb90851875280.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a421c2ed6b295c458071f1988b9d6f7b46e8992c.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a421c2ed6b295c458071f1988b9d6f7b46e8992c.hip deleted file mode 100644 index 55057543df91..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a421c2ed6b295c458071f1988b9d6f7b46e8992c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a4700d87a19a173e84d64e43cffabbed52366e35.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a4700d87a19a173e84d64e43cffabbed52366e35.hip deleted file mode 100644 index b3775081f0aa..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a4700d87a19a173e84d64e43cffabbed52366e35.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, 
- false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a487f617c4b84c6a0328fedac750d41dc3dafe27.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a487f617c4b84c6a0328fedac750d41dc3dafe27.hip deleted file mode 100644 index 0fe779c20a88..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a487f617c4b84c6a0328fedac750d41dc3dafe27.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a48843d844f78690c7a45b730652f0f763c595c7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a48843d844f78690c7a45b730652f0f763c595c7.hip deleted file mode 100644 index e762047290ef..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a48843d844f78690c7a45b730652f0f763c595c7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a4980becb0d3149fee575bad1fc3b463d08aabf5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a4980becb0d3149fee575bad1fc3b463d08aabf5.hip deleted file mode 100644 index 25037c1333b1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a4980becb0d3149fee575bad1fc3b463d08aabf5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 
0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a4b7f10440331a8a88ff93ba253217c2832bcf9e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a4b7f10440331a8a88ff93ba253217c2832bcf9e.hip deleted file mode 100644 index 781310164542..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a4b7f10440331a8a88ff93ba253217c2832bcf9e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, 
- ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a55b47aafc4340e69e300ac61a7601a5c14513b7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a55b47aafc4340e69e300ac61a7601a5c14513b7.hip deleted file mode 100644 index df41b3877243..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a55b47aafc4340e69e300ac61a7601a5c14513b7.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a55c7dd576e5b1061c059e5e99aeedf4389e2d25.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a55c7dd576e5b1061c059e5e99aeedf4389e2d25.hip deleted file mode 100644 index a830e93c2d46..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a55c7dd576e5b1061c059e5e99aeedf4389e2d25.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a59423c095db052603d77073d409534bceef425f.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a59423c095db052603d77073d409534bceef425f.hip deleted file mode 100644 index e51048b4fc90..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a59423c095db052603d77073d409534bceef425f.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5a7833f4597bb03a3e845d5580d677e97421040.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5a7833f4597bb03a3e845d5580d677e97421040.hip deleted file mode 100644 index fc3fea65dc68..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5a7833f4597bb03a3e845d5580d677e97421040.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5bdc110955c05c6c6ea236a6f60266a4a6dce5e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5bdc110955c05c6c6ea236a6f60266a4a6dce5e.hip deleted file mode 100644 index fe03517034b7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5bdc110955c05c6c6ea236a6f60266a4a6dce5e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5c0109313de1f6245d2a80f8539485b849e9d55.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5c0109313de1f6245d2a80f8539485b849e9d55.hip deleted file mode 100644 index 7e8564220a37..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5c0109313de1f6245d2a80f8539485b849e9d55.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5c4dc0d70c547dbbfb661e879ba7f9adfafc2ea.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5c4dc0d70c547dbbfb661e879ba7f9adfafc2ea.hip deleted file mode 100644 index acd7584593b0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5c4dc0d70c547dbbfb661e879ba7f9adfafc2ea.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5d4eb673bafd81e3a0ee213da4603d88b8460ec.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5d4eb673bafd81e3a0ee213da4603d88b8460ec.hip deleted file mode 100644 index 
3bf976b5b2e8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5d4eb673bafd81e3a0ee213da4603d88b8460ec.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - 
constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5e5cae764142683b70d3344cf07dd1edb7d69e2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5e5cae764142683b70d3344cf07dd1edb7d69e2.hip deleted file mode 100644 index bcc20d9d3832..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5e5cae764142683b70d3344cf07dd1edb7d69e2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5f2f0cef657ae5e333d65ae4ab20529a43cd7de.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5f2f0cef657ae5e333d65ae4ab20529a43cd7de.hip deleted file mode 100644 index 8788ff825538..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5f2f0cef657ae5e333d65ae4ab20529a43cd7de.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = 
ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5f8b7b2a891aa9f2ab49762eb31d835efdf18b6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5f8b7b2a891aa9f2ab49762eb31d835efdf18b6.hip deleted file mode 100644 index f72f0eb7cf7d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5f8b7b2a891aa9f2ab49762eb31d835efdf18b6.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5fa94bb32a80e81886b711ebfcf2df5f5405866.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5fa94bb32a80e81886b711ebfcf2df5f5405866.hip deleted file mode 100644 index c04f2790cb06..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a5fa94bb32a80e81886b711ebfcf2df5f5405866.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a622fa57764ec746e02f6d4bd4846b48c722b807.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a622fa57764ec746e02f6d4bd4846b48c722b807.hip deleted file mode 100644 index 59dc4d1674c2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a622fa57764ec746e02f6d4bd4846b48c722b807.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ 
- using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a62a2ab489839ea1a1bfd1b24e54a3c232ed934f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a62a2ab489839ea1a1bfd1b24e54a3c232ed934f.hip deleted file mode 100644 index 7f13ed4266c5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a62a2ab489839ea1a1bfd1b24e54a3c232ed934f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a6461d72fb6ba50e81de3f661528c96dcfdc3f3c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a6461d72fb6ba50e81de3f661528c96dcfdc3f3c.hip deleted file mode 100644 index 0037c2b61515..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a6461d72fb6ba50e81de3f661528c96dcfdc3f3c.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << 
std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a64b4cf3f6706e4b4e0af4402e2263b9a1585f9b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a64b4cf3f6706e4b4e0af4402e2263b9a1585f9b.hip deleted file mode 100644 index bdcd8611d9cb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a64b4cf3f6706e4b4e0af4402e2263b9a1585f9b.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a65c43b870705c780d734f9ef063f55cf8b3b52d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a65c43b870705c780d734f9ef063f55cf8b3b52d.hip deleted file mode 100644 index af5d24650029..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a65c43b870705c780d734f9ef063f55cf8b3b52d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - 
-template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a673f35edd69241c6b921d6712dfd064d78ecbad.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a673f35edd69241c6b921d6712dfd064d78ecbad.hip deleted file mode 100644 index a8bf1bced5bb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a673f35edd69241c6b921d6712dfd064d78ecbad.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - 
constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a71305f191f06cd53b7563971c706e8b71b19e2f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a71305f191f06cd53b7563971c706e8b71b19e2f.hip deleted file mode 100644 index 54c9e1be5615..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a71305f191f06cd53b7563971c706e8b71b19e2f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using 
dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a74b0e7dd816ad08eec5a1bba6e227afee9813ec.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a74b0e7dd816ad08eec5a1bba6e227afee9813ec.hip deleted file mode 100644 index cb6605dd510f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a74b0e7dd816ad08eec5a1bba6e227afee9813ec.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a7784b03ad757d51c234fa86ea9891f055ecd5c1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a7784b03ad757d51c234fa86ea9891f055ecd5c1.hip deleted file mode 100644 index 7e4ac2125a72..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a7784b03ad757d51c234fa86ea9891f055ecd5c1.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - true, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::fp16_t, - true, - true, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a78fecb9725ceb4bcf2aa037d43bc43efeb1c3fd.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a78fecb9725ceb4bcf2aa037d43bc43efeb1c3fd.hip deleted file mode 100644 index 829a327af50d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a78fecb9725ceb4bcf2aa037d43bc43efeb1c3fd.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a7f7553a7d2f6d42fe695cdc64423c85223af440.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a7f7553a7d2f6d42fe695cdc64423c85223af440.hip deleted file mode 100644 index f35e0dd453d7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a7f7553a7d2f6d42fe695cdc64423c85223af440.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a821661d8280c6e9d27f2c9ce1b3c855387b5a76.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a821661d8280c6e9d27f2c9ce1b3c855387b5a76.hip deleted file mode 100644 index 271d75768932..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a821661d8280c6e9d27f2c9ce1b3c855387b5a76.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a85d35b2fd98742427930eb536e346ffb005edd8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a85d35b2fd98742427930eb536e346ffb005edd8.hip deleted file mode 100644 index 61b95a2c1454..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a85d35b2fd98742427930eb536e346ffb005edd8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a8a4af070ee46d802cb11086b93daf91538f8a04.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a8a4af070ee46d802cb11086b93daf91538f8a04.hip deleted file mode 100644 index 2fccdb851edf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a8a4af070ee46d802cb11086b93daf91538f8a04.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a8a744edfa3a19d1493611df5bd0d4d59b707d43.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a8a744edfa3a19d1493611df5bd0d4d59b707d43.hip deleted file mode 100644 index 1b0e1a78094f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a8a744edfa3a19d1493611df5bd0d4d59b707d43.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a92b43d374642df991edef1f6036dc898bf77cf8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a92b43d374642df991edef1f6036dc898bf77cf8.hip deleted file mode 100644 index 578fdbdb463d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a92b43d374642df991edef1f6036dc898bf77cf8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a93324ccf11b273ed20fd960c61df897c8890b1d.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a93324ccf11b273ed20fd960c61df897c8890b1d.hip deleted file mode 100644 index ad910a4ad950..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a93324ccf11b273ed20fd960c61df897c8890b1d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a93a03b33305b33055273711ab31a5b8d8298d5d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a93a03b33305b33055273711ab31a5b8d8298d5d.hip deleted file mode 100644 index 4127c092ffbd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a93a03b33305b33055273711ab31a5b8d8298d5d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a968df29f5ae1463706b7981b3bde55918e1aa65.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a968df29f5ae1463706b7981b3bde55918e1aa65.hip deleted file mode 100644 index b5ab69da5f61..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a968df29f5ae1463706b7981b3bde55918e1aa65.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a98925d99dc484da41dd55700e151cf545cf821d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a98925d99dc484da41dd55700e151cf545cf821d.hip deleted file mode 100644 index 5966147a0b5f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a98925d99dc484da41dd55700e151cf545cf821d.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a9b50c6ebb27986ce5b378d8c39315eb9cb91dea.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a9b50c6ebb27986ce5b378d8c39315eb9cb91dea.hip deleted file mode 100644 index dbbdbaaf80ba..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a9b50c6ebb27986ce5b378d8c39315eb9cb91dea.hip +++ /dev/null @@ -1,138 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - 
ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a9d2be18e2d53a5144f97dfdebb225fcb6d611d3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a9d2be18e2d53a5144f97dfdebb225fcb6d611d3.hip deleted file mode 100644 index 8d7cdebbd730..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a9d2be18e2d53a5144f97dfdebb225fcb6d611d3.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a9df9ac4ee78e5f4d5bd0567e58a7090907c61e1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a9df9ac4ee78e5f4d5bd0567e58a7090907c61e1.hip deleted file mode 100644 index f91d5eac3fa5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a9df9ac4ee78e5f4d5bd0567e58a7090907c61e1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a9f00f270680de81df7737e848e0408cb070e68b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a9f00f270680de81df7737e848e0408cb070e68b.hip deleted file mode 100644 index 73efa28b3162..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_a9f00f270680de81df7737e848e0408cb070e68b.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 64, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, false, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa1041530f794c7b8dc4a8321ea0fcdd338fff35.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa1041530f794c7b8dc4a8321ea0fcdd338fff35.hip deleted file mode 100644 index f384c617e3ea..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa1041530f794c7b8dc4a8321ea0fcdd338fff35.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa522b43c5e5ea69bcabb4c0fe28def2bd081a12.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa522b43c5e5ea69bcabb4c0fe28def2bd081a12.hip deleted file mode 100644 index 328a9c14353e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa522b43c5e5ea69bcabb4c0fe28def2bd081a12.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa6d13b09f85ee62bb5018608812181fb43afc86.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa6d13b09f85ee62bb5018608812181fb43afc86.hip deleted file mode 100644 index 55cd7f989991..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa6d13b09f85ee62bb5018608812181fb43afc86.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa82d20635e592edbf00439294835f6f39ad54a3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa82d20635e592edbf00439294835f6f39ad54a3.hip deleted file mode 100644 index cef0937d502c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa82d20635e592edbf00439294835f6f39ad54a3.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa996b9c843200a2ec33ed4319b48106cd7c6384.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa996b9c843200a2ec33ed4319b48106cd7c6384.hip deleted file mode 100644 index 9a23fd3a853a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aa996b9c843200a2ec33ed4319b48106cd7c6384.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aafe891dad43815e635f81225705ff944f990d75.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aafe891dad43815e635f81225705ff944f990d75.hip deleted file mode 100644 index 655ee83d8e96..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aafe891dad43815e635f81225705ff944f990d75.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab09941bddfa9d61985b55f9b6bf0edec9bb89f6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab09941bddfa9d61985b55f9b6bf0edec9bb89f6.hip deleted file mode 100644 index d9b5508c76d9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab09941bddfa9d61985b55f9b6bf0edec9bb89f6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab0be5a2072b5e87f5ee58149688796b6513219f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab0be5a2072b5e87f5ee58149688796b6513219f.hip deleted file mode 100644 index 586302a0693a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab0be5a2072b5e87f5ee58149688796b6513219f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab0c3fe9529e24327686070731d0ac3ada76245e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab0c3fe9529e24327686070731d0ac3ada76245e.hip deleted file mode 100644 index 2f9ca5ec1d7a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab0c3fe9529e24327686070731d0ac3ada76245e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab1ca4ce061f7f69a250356f613cab00d1e2ac71.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab1ca4ce061f7f69a250356f613cab00d1e2ac71.hip deleted file mode 100644 index ee98cd327b7a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab1ca4ce061f7f69a250356f613cab00d1e2ac71.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab1d7f93427095e39bfc1d986b3d7fe54073ec75.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab1d7f93427095e39bfc1d986b3d7fe54073ec75.hip deleted file mode 100644 index cd3f04da24ec..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab1d7f93427095e39bfc1d986b3d7fe54073ec75.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab43f4a56c166dad0113f51b337a083f4df7cdb6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab43f4a56c166dad0113f51b337a083f4df7cdb6.hip deleted file mode 100644 index 8a79d192d547..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab43f4a56c166dad0113f51b337a083f4df7cdb6.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab56e886d53a1d88fada0f10f00b9f398dc54568.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab56e886d53a1d88fada0f10f00b9f398dc54568.hip deleted file mode 100644 index 3eadd1bd28e5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab56e886d53a1d88fada0f10f00b9f398dc54568.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - 
std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab6cd5c9242f8278c8f3d9ce57b97d605c7e5a3e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab6cd5c9242f8278c8f3d9ce57b97d605c7e5a3e.hip deleted file mode 100644 index e7d75ae4cd2d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab6cd5c9242f8278c8f3d9ce57b97d605c7e5a3e.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::bf16_t, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab877ae2a1aab04498bf2b26b3fe99d6488ef151.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab877ae2a1aab04498bf2b26b3fe99d6488ef151.hip deleted file mode 100644 index 9734f97bd608..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ab877ae2a1aab04498bf2b26b3fe99d6488ef151.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_abf6c6412f9853855b74a96e862935ddef66f763.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_abf6c6412f9853855b74a96e862935ddef66f763.hip deleted file mode 100644 index afa30674780f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_abf6c6412f9853855b74a96e862935ddef66f763.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - 
-template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_abf92a5314fd33491b5eb6ebd2418b7e0d5db774.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_abf92a5314fd33491b5eb6ebd2418b7e0d5db774.hip deleted file mode 100644 index 22742e24afbe..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_abf92a5314fd33491b5eb6ebd2418b7e0d5db774.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, 
- typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ac1ccde31b47e0e56ee0daab6403fed7895208c7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ac1ccde31b47e0e56ee0daab6403fed7895208c7.hip deleted file mode 100644 index 5600e3189fbd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ac1ccde31b47e0e56ee0daab6403fed7895208c7.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ac5e9aee85cd16903bf7b82a4ac10402b0b26e22.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ac5e9aee85cd16903bf7b82a4ac10402b0b26e22.hip deleted file mode 100644 index e2cf68e6382a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ac5e9aee85cd16903bf7b82a4ac10402b0b26e22.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ac9382cf8bb56ffd962c99329bf67da992f8810d.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ac9382cf8bb56ffd962c99329bf67da992f8810d.hip deleted file mode 100644 index f7c405f44d87..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ac9382cf8bb56ffd962c99329bf67da992f8810d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aceb0641213e9a45ba48bcf72bb23845720d8b79.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aceb0641213e9a45ba48bcf72bb23845720d8b79.hip deleted file mode 100644 index 5d50b3f96963..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aceb0641213e9a45ba48bcf72bb23845720d8b79.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - 
false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ad091c69d19b27f7ad50ef6311532ad8b642a9c6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ad091c69d19b27f7ad50ef6311532ad8b642a9c6.hip deleted file mode 100644 index c45321177eab..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ad091c69d19b27f7ad50ef6311532ad8b642a9c6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ad82071cc074fd30437f6158b5eb2c6df1f8c587.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ad82071cc074fd30437f6158b5eb2c6df1f8c587.hip deleted file mode 100644 index 2a409b3c1191..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ad82071cc074fd30437f6158b5eb2c6df1f8c587.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ad989d2ce769f20e175fa88f4082c1c25fe03062.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ad989d2ce769f20e175fa88f4082c1c25fe03062.hip deleted file mode 100644 index 2e5b2e2ffda9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ad989d2ce769f20e175fa88f4082c1c25fe03062.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ad9b99a194b59d3149842c15733394da275b12c0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ad9b99a194b59d3149842c15733394da275b12c0.hip deleted file mode 100644 index bd65652c940d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ad9b99a194b59d3149842c15733394da275b12c0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, 
- false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ada016be2bd0e377fbe01fa7adb9bbb8febce100.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ada016be2bd0e377fbe01fa7adb9bbb8febce100.hip deleted file mode 100644 index b70e9584be9b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ada016be2bd0e377fbe01fa7adb9bbb8febce100.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_adae2d4f8b2dac799e03ea6f279e6ecdf66f5381.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_adae2d4f8b2dac799e03ea6f279e6ecdf66f5381.hip deleted file mode 100644 index 27b5bb9e27ea..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_adae2d4f8b2dac799e03ea6f279e6ecdf66f5381.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_adaef10ff2c5d89530310bdf1d53a194f06a94ef.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_adaef10ff2c5d89530310bdf1d53a194f06a94ef.hip deleted file mode 100644 index b842623af735..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_adaef10ff2c5d89530310bdf1d53a194f06a94ef.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_add29e3e9828911a117dccaa5650e77805730d14.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_add29e3e9828911a117dccaa5650e77805730d14.hip deleted file mode 100644 index 71ed5a679714..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_add29e3e9828911a117dccaa5650e77805730d14.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_adda7ad787524e3e47dcc1b65c41b2faea38f55f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_adda7ad787524e3e47dcc1b65c41b2faea38f55f.hip deleted file mode 100644 index 092b0e389fed..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_adda7ad787524e3e47dcc1b65c41b2faea38f55f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_addb6a14043c5a4df0f5042b3770b40c4e90795c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_addb6a14043c5a4df0f5042b3770b40c4e90795c.hip deleted file mode 100644 index 396e283283b7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_addb6a14043c5a4df0f5042b3770b40c4e90795c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_adf160741a4f751d2f15d6eb23d4121cdca62b55.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_adf160741a4f751d2f15d6eb23d4121cdca62b55.hip deleted file mode 100644 index 961d81dcbdc8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_adf160741a4f751d2f15d6eb23d4121cdca62b55.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae1ab1f4bbe86bb9bbc22e4774648076c321136f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae1ab1f4bbe86bb9bbc22e4774648076c321136f.hip deleted file mode 100644 index 675ad03969cd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae1ab1f4bbe86bb9bbc22e4774648076c321136f.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae1afeb6cfdf860ff08e4c2f11c922fd5bfa621a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae1afeb6cfdf860ff08e4c2f11c922fd5bfa621a.hip deleted file mode 100644 index 2a1126b12c36..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae1afeb6cfdf860ff08e4c2f11c922fd5bfa621a.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae239476d61f48379754b97f29d7a285cc3192de.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae239476d61f48379754b97f29d7a285cc3192de.hip deleted file mode 100644 index 20f5842fd656..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae239476d61f48379754b97f29d7a285cc3192de.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae4e7253ad4873576052ec0a9400597bb7975753.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae4e7253ad4873576052ec0a9400597bb7975753.hip deleted file mode 100644 index 40395e507c4f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae4e7253ad4873576052ec0a9400597bb7975753.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae4e80cb185759dd9b3eb3c67c239964b3694caa.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae4e80cb185759dd9b3eb3c67c239964b3694caa.hip deleted file mode 100644 index 7c9715610787..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae4e80cb185759dd9b3eb3c67c239964b3694caa.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; 
- -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae51b30c7e1cd30e550187458350c8db7c59a9ef.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae51b30c7e1cd30e550187458350c8db7c59a9ef.hip deleted file mode 100644 index 9aa4cef1e17c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae51b30c7e1cd30e550187458350c8db7c59a9ef.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae7899b1ef159ecbf01f27014601eb79b31b49b3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae7899b1ef159ecbf01f27014601eb79b31b49b3.hip deleted file mode 100644 index 4b42db971d1d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae7899b1ef159ecbf01f27014601eb79b31b49b3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae87b1d5c50606430b544ed650d87df24366e7d5.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae87b1d5c50606430b544ed650d87df24366e7d5.hip deleted file mode 100644 index bf947e99de0f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae87b1d5c50606430b544ed650d87df24366e7d5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae8d0bdde763e617beafc0365ec4a3cd11df6c55.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae8d0bdde763e617beafc0365ec4a3cd11df6c55.hip deleted file mode 100644 index 2b3cf7a363bf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ae8d0bdde763e617beafc0365ec4a3cd11df6c55.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, 
- false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aebb2441e6cc1ccba4a391566e547402bcf7ced2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aebb2441e6cc1ccba4a391566e547402bcf7ced2.hip deleted file mode 100644 index b996d43df67b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aebb2441e6cc1ccba4a391566e547402bcf7ced2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aebd5fed34ebceb879ae3dffaf58c7c04ab5fe80.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aebd5fed34ebceb879ae3dffaf58c7c04ab5fe80.hip deleted file mode 100644 index 63771416681f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aebd5fed34ebceb879ae3dffaf58c7c04ab5fe80.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aebff7e6605b273bad844b8f70ef031625bff48e.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aebff7e6605b273bad844b8f70ef031625bff48e.hip deleted file mode 100644 index 18aa32f29a1b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aebff7e6605b273bad844b8f70ef031625bff48e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aec87e65afa93e84d7a947c52f291c1c7360033c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aec87e65afa93e84d7a947c52f291c1c7360033c.hip deleted file mode 100644 index a2e054816ec7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aec87e65afa93e84d7a947c52f291c1c7360033c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - 
false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aece14f7a220222eb4ce6783ec2b9fce6fde94b8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aece14f7a220222eb4ce6783ec2b9fce6fde94b8.hip deleted file mode 100644 index b65dffe783fd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_aece14f7a220222eb4ce6783ec2b9fce6fde94b8.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << 
", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_af06c0dae15684f83e15722a4c07342af9ea011c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_af06c0dae15684f83e15722a4c07342af9ea011c.hip deleted file mode 100644 index d419f27d1be2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_af06c0dae15684f83e15722a4c07342af9ea011c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - 
if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_af6ccfa11add1ae49888337e84d9c446d2f67da4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_af6ccfa11add1ae49888337e84d9c446d2f67da4.hip deleted file mode 100644 index 494f7e4bde69..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_af6ccfa11add1ae49888337e84d9c446d2f67da4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afadc4f76e237514db0bc0203102297b79730bd0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afadc4f76e237514db0bc0203102297b79730bd0.hip deleted file mode 100644 index fbdf5cdc2a44..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afadc4f76e237514db0bc0203102297b79730bd0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afc4b47a6fa62a4ca5cff6a7e01c9f6b371d2215.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afc4b47a6fa62a4ca5cff6a7e01c9f6b371d2215.hip deleted file mode 100644 index d2858c19b9eb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afc4b47a6fa62a4ca5cff6a7e01c9f6b371d2215.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - true, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::bf16_t, - true, - true, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afcafd07c1f56e74373ccf37db35976023456d50.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afcafd07c1f56e74373ccf37db35976023456d50.hip deleted file mode 100644 index 3432aa47b55d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afcafd07c1f56e74373ccf37db35976023456d50.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afccf699f593c828e11efc053b144044e45b32d6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afccf699f593c828e11efc053b144044e45b32d6.hip deleted file mode 100644 index e32ac0ef383e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afccf699f593c828e11efc053b144044e45b32d6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afda8f46b5ded4c2aa9d722fec17b75004b59f7d.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afda8f46b5ded4c2aa9d722fec17b75004b59f7d.hip deleted file mode 100644 index 32d191fbc037..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afda8f46b5ded4c2aa9d722fec17b75004b59f7d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afdab954fd111ec48721f25710d61c0c8affd8db.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afdab954fd111ec48721f25710d61c0c8affd8db.hip deleted file mode 100644 index f631f9758668..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_afdab954fd111ec48721f25710d61c0c8affd8db.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - 
false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b00e062055933388e37525df5766f3c14cd3538a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b00e062055933388e37525df5766f3c14cd3538a.hip deleted file mode 100644 index f6ca9c9f5303..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b00e062055933388e37525df5766f3c14cd3538a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b01dc872c24db4db0c9179fc07e17f41060390de.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b01dc872c24db4db0c9179fc07e17f41060390de.hip deleted file mode 100644 index 479d80756452..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b01dc872c24db4db0c9179fc07e17f41060390de.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b03ab68e33844f97aa58d463e00037bc11c50da0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b03ab68e33844f97aa58d463e00037bc11c50da0.hip deleted file mode 100644 index fb955f754811..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b03ab68e33844f97aa58d463e00037bc11c50da0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - 
false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b04f14f829eff73afaa57a875f74ebd1e6860979.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b04f14f829eff73afaa57a875f74ebd1e6860979.hip deleted file mode 100644 index a4c291be568f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b04f14f829eff73afaa57a875f74ebd1e6860979.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b0544a38dfdf4d81dc95894387845f48435e299a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b0544a38dfdf4d81dc95894387845f48435e299a.hip deleted file mode 100644 index d25407ebe728..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b0544a38dfdf4d81dc95894387845f48435e299a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b0dd965d5d9080ed5c6a04b7eea9890f3a264f20.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b0dd965d5d9080ed5c6a04b7eea9890f3a264f20.hip deleted file mode 100644 index 15c3c16d28f7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b0dd965d5d9080ed5c6a04b7eea9890f3a264f20.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b0f555b74ed36f1bef8f47880b3edc6760f27788.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b0f555b74ed36f1bef8f47880b3edc6760f27788.hip deleted file mode 100644 index d419a0e79df2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b0f555b74ed36f1bef8f47880b3edc6760f27788.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b1766695dbb790bd614b83dc7569ad449404cc89.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b1766695dbb790bd614b83dc7569ad449404cc89.hip deleted file mode 100644 index 5baf6bfebba0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b1766695dbb790bd614b83dc7569ad449404cc89.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b18a615e66d7cd739ce35412811359a03cb23a8e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b18a615e66d7cd739ce35412811359a03cb23a8e.hip deleted file mode 100644 index e61d86ba0f11..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b18a615e66d7cd739ce35412811359a03cb23a8e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b192c55f002d8540d5f965cc4df0c2e33f4b9ff9.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b192c55f002d8540d5f965cc4df0c2e33f4b9ff9.hip deleted file mode 100644 index 1d57e0d31c22..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b192c55f002d8540d5f965cc4df0c2e33f4b9ff9.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 64, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<64, ck_tile::bf16_t, false, true, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b19f05f6848403480ba41d37cdbf44ccca1b1f8d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b19f05f6848403480ba41d37cdbf44ccca1b1f8d.hip deleted file mode 100644 index c46c86bc35ea..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b19f05f6848403480ba41d37cdbf44ccca1b1f8d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b1ad101ce91348266d3885afdf2996a0fdb72135.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b1ad101ce91348266d3885afdf2996a0fdb72135.hip deleted file mode 100644 index adb9abaa0866..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b1ad101ce91348266d3885afdf2996a0fdb72135.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b1c5d55d47d6038e9162d32ac968ff58c0942938.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b1c5d55d47d6038e9162d32ac968ff58c0942938.hip deleted file mode 100644 index 2e9ec621eff0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b1c5d55d47d6038e9162d32ac968ff58c0942938.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b20c6252863a73341b0010191fad4c834860f884.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b20c6252863a73341b0010191fad4c834860f884.hip deleted file mode 100644 index 6f4bf9142f38..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b20c6252863a73341b0010191fad4c834860f884.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b20e314642cf565e4f32bceffdb5c0e653ab627b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b20e314642cf565e4f32bceffdb5c0e653ab627b.hip deleted file mode 100644 index 32ffb75dcb99..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b20e314642cf565e4f32bceffdb5c0e653ab627b.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b24f91dec2029b25d0d96962528410df55a468ed.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b24f91dec2029b25d0d96962528410df55a468ed.hip deleted file mode 100644 index 5f679a4c3887..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b24f91dec2029b25d0d96962528410df55a468ed.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - 
constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b285e2f1970b78e18002464eeda63798229bbc3a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b285e2f1970b78e18002464eeda63798229bbc3a.hip deleted file mode 100644 index c90705b7782d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b285e2f1970b78e18002464eeda63798229bbc3a.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b298e213f927b518c693660110f08bdd94990ef0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b298e213f927b518c693660110f08bdd94990ef0.hip deleted file mode 100644 index 3ec87d74291d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b298e213f927b518c693660110f08bdd94990ef0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - 
-template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b2af5f5b5ee3ae964824a3e9c7bbeb5bb39c557c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b2af5f5b5ee3ae964824a3e9c7bbeb5bb39c557c.hip deleted file mode 100644 index fcc5641cf9e1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b2af5f5b5ee3ae964824a3e9c7bbeb5bb39c557c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - 
constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b2f91e937b427ecc932c0cb0c90b2c2378db0be6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b2f91e937b427ecc932c0cb0c90b2c2378db0be6.hip deleted file mode 100644 index 4c74ec252cfd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b2f91e937b427ecc932c0cb0c90b2c2378db0be6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using 
dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b3063d06723ac70c5f8802ab49c5c35e1debf56e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b3063d06723ac70c5f8802ab49c5c35e1debf56e.hip deleted file mode 100644 index de835dc163e7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b3063d06723ac70c5f8802ab49c5c35e1debf56e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b31f56244076c501cb09b4b90975132cae4c4386.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b31f56244076c501cb09b4b90975132cae4c4386.hip deleted file mode 100644 index 23b0e206766b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b31f56244076c501cb09b4b90975132cae4c4386.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b3486244e0b7d6dbcaa1951e8b8883ce441c3f99.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b3486244e0b7d6dbcaa1951e8b8883ce441c3f99.hip deleted file mode 100644 index dcd7724f0534..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b3486244e0b7d6dbcaa1951e8b8883ce441c3f99.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, 
- true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b34c1ce348c3d9cdf6bbec9758de9d5fe94c43fc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b34c1ce348c3d9cdf6bbec9758de9d5fe94c43fc.hip deleted file mode 100644 index 4736e14de6ea..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b34c1ce348c3d9cdf6bbec9758de9d5fe94c43fc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b38a1d3cffae01332a3a9d9472ff1b2c443e82af.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b38a1d3cffae01332a3a9d9472ff1b2c443e82af.hip deleted file mode 100644 index f76a1230b485..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b38a1d3cffae01332a3a9d9472ff1b2c443e82af.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b3a104733f678193068d8642d6560faa03897258.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b3a104733f678193068d8642d6560faa03897258.hip deleted file mode 100644 index 619619939d99..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b3a104733f678193068d8642d6560faa03897258.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b3da22d3482738a8474ae15e8e5fca9020c4e195.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b3da22d3482738a8474ae15e8e5fca9020c4e195.hip deleted file mode 100644 index 3cacc0f7f130..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b3da22d3482738a8474ae15e8e5fca9020c4e195.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b41735d250b5a16967281a5f07873b9cde3df4d6.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b41735d250b5a16967281a5f07873b9cde3df4d6.hip deleted file mode 100644 index 104c9ad00c8b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b41735d250b5a16967281a5f07873b9cde3df4d6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b41a30092e8138877c1f6c25656e0f8ae2c2444e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b41a30092e8138877c1f6c25656e0f8ae2c2444e.hip deleted file mode 100644 index 48ec2a43608b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b41a30092e8138877c1f6c25656e0f8ae2c2444e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, 
- false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b41ea5293bc1c56efa2c4b5681d965aa6f2ce6c3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b41ea5293bc1c56efa2c4b5681d965aa6f2ce6c3.hip deleted file mode 100644 index af9fc1e76a56..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b41ea5293bc1c56efa2c4b5681d965aa6f2ce6c3.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << 
", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4588379eaa268d79fe8f8e4457b009f204a5fb7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4588379eaa268d79fe8f8e4457b009f204a5fb7.hip deleted file mode 100644 index e30f149cc484..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4588379eaa268d79fe8f8e4457b009f204a5fb7.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 128, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<128, ck_tile::fp16_t, false, true, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b493c99888d82cd2852bfb101f99a2e6a27665b8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b493c99888d82cd2852bfb101f99a2e6a27665b8.hip deleted file mode 100644 index 5535687adb2d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b493c99888d82cd2852bfb101f99a2e6a27665b8.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4a5715b550f67b8870ba66e1e6282a26cc1dbf3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4a5715b550f67b8870ba66e1e6282a26cc1dbf3.hip deleted file mode 100644 index d2db97ae443c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4a5715b550f67b8870ba66e1e6282a26cc1dbf3.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4b037a2e262d11d3ed7d9feeb41b9e05427a739.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4b037a2e262d11d3ed7d9feeb41b9e05427a739.hip deleted file mode 100644 index e5aed0abffcf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4b037a2e262d11d3ed7d9feeb41b9e05427a739.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4bd2d206ceb237ed2c51f58abb5cbf96e39d07b.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4bd2d206ceb237ed2c51f58abb5cbf96e39d07b.hip deleted file mode 100644 index 2c5b18529c01..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4bd2d206ceb237ed2c51f58abb5cbf96e39d07b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4ec377c44ac18527ca6a01bc3b146706a6e1e09.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4ec377c44ac18527ca6a01bc3b146706a6e1e09.hip deleted file mode 100644 index 7d8eb8203a20..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4ec377c44ac18527ca6a01bc3b146706a6e1e09.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - 
false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4f12f10d7b968e0d8e7c23f36d3a360de74a905.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4f12f10d7b968e0d8e7c23f36d3a360de74a905.hip deleted file mode 100644 index 6e5334c6fe86..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b4f12f10d7b968e0d8e7c23f36d3a360de74a905.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", 
" << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b50e6df20a2426abd3d2ff2262a37c009196024c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b50e6df20a2426abd3d2ff2262a37c009196024c.hip deleted file mode 100644 index bf900936535d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b50e6df20a2426abd3d2ff2262a37c009196024c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - 
if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b513834918d5ea789e2db21abece7c2d3532a7e7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b513834918d5ea789e2db21abece7c2d3532a7e7.hip deleted file mode 100644 index 4d9e1ab628ed..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b513834918d5ea789e2db21abece7c2d3532a7e7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5248f443a12d96815c04409a00102923c717023.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5248f443a12d96815c04409a00102923c717023.hip deleted file mode 100644 index 3a025b212016..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5248f443a12d96815c04409a00102923c717023.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5371415448fffffd58bf014dac9f4876153657b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5371415448fffffd58bf014dac9f4876153657b.hip deleted file mode 100644 index c1f77b871dd8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5371415448fffffd58bf014dac9f4876153657b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5ac596c636df55e81293228cbc53dcbb3024e5a.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5ac596c636df55e81293228cbc53dcbb3024e5a.hip deleted file mode 100644 index 81275b67f37a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5ac596c636df55e81293228cbc53dcbb3024e5a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5ba2e73df35f6e0f7317303823fde92a42b1a35.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5ba2e73df35f6e0f7317303823fde92a42b1a35.hip deleted file mode 100644 index c50c8a851c37..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5ba2e73df35f6e0f7317303823fde92a42b1a35.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - 
false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5bccc85f74f54a2ceb17fe3040b04fe306c53f9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5bccc85f74f54a2ceb17fe3040b04fe306c53f9.hip deleted file mode 100644 index aa77681e56f1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5bccc85f74f54a2ceb17fe3040b04fe306c53f9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5c3131fb8e5a25bd4a14bc9075eb6fa01b61d02.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5c3131fb8e5a25bd4a14bc9075eb6fa01b61d02.hip deleted file mode 100644 index 21bc1da79f98..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5c3131fb8e5a25bd4a14bc9075eb6fa01b61d02.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5c7fca1f76a31b0390e92d90d569fab94d4f783.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5c7fca1f76a31b0390e92d90d569fab94d4f783.hip deleted file mode 100644 index 4b2061e07074..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5c7fca1f76a31b0390e92d90d569fab94d4f783.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5db3d5b1d8af89381fc4b8073f84c5fa25fdef5.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5db3d5b1d8af89381fc4b8073f84c5fa25fdef5.hip deleted file mode 100644 index a633d45545e6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b5db3d5b1d8af89381fc4b8073f84c5fa25fdef5.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::fp16_t, - false, - false, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b60a4e87a7aabfe3c1ce02b408522f3ec862e3d7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b60a4e87a7aabfe3c1ce02b408522f3ec862e3d7.hip deleted file mode 100644 index 4aac2ba6d11f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b60a4e87a7aabfe3c1ce02b408522f3ec862e3d7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b6b17ae67adee9e56a022cd2a5514fb9c4e99920.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b6b17ae67adee9e56a022cd2a5514fb9c4e99920.hip deleted file mode 100644 index b4dd9a5f26f2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b6b17ae67adee9e56a022cd2a5514fb9c4e99920.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b72a804bb3c99830653d41ac0bd49943c801b89a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b72a804bb3c99830653d41ac0bd49943c801b89a.hip deleted file mode 100644 index 49f0019e84d0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b72a804bb3c99830653d41ac0bd49943c801b89a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - 
-template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b737410b404a51043fc3bd503c0b107c297e4c9f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b737410b404a51043fc3bd503c0b107c297e4c9f.hip deleted file mode 100644 index 669c4f50cc2a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b737410b404a51043fc3bd503c0b107c297e4c9f.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << 
std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b75843bb13058ffe29251e053800c509c7590544.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b75843bb13058ffe29251e053800c509c7590544.hip deleted file mode 100644 index 8661dd83e17d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b75843bb13058ffe29251e053800c509c7590544.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " 
<< k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b774450ebadaacf23e944aaf8ca90eada01e8a5a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b774450ebadaacf23e944aaf8ca90eada01e8a5a.hip deleted file mode 100644 index cd5ca1de7620..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b774450ebadaacf23e944aaf8ca90eada01e8a5a.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return 
ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b779cc0b0380e1e6a2b51fc6216fdd72215b882b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b779cc0b0380e1e6a2b51fc6216fdd72215b882b.hip deleted file mode 100644 index 57931b569314..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b779cc0b0380e1e6a2b51fc6216fdd72215b882b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t 
kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b7a03ab0b7887cc7ed0cb40e56360a8d36c0bb8e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b7a03ab0b7887cc7ed0cb40e56360a8d36c0bb8e.hip deleted file mode 100644 index 900530389684..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b7a03ab0b7887cc7ed0cb40e56360a8d36c0bb8e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = 
fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b80d0828ba6d24ea3c1a97bd9835ee937b4b32fb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b80d0828ba6d24ea3c1a97bd9835ee937b4b32fb.hip deleted file mode 100644 index 3f2cf269e13c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b80d0828ba6d24ea3c1a97bd9835ee937b4b32fb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b872f9e6ebe330cc1818ea82b53acec79a2f672c.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b872f9e6ebe330cc1818ea82b53acec79a2f672c.hip deleted file mode 100644 index 9831aad701fd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b872f9e6ebe330cc1818ea82b53acec79a2f672c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b8fbc6f6e9c515edce3c7a438b3bc308b30d3857.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b8fbc6f6e9c515edce3c7a438b3bc308b30d3857.hip deleted file mode 100644 index 11c22fde0ecd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b8fbc6f6e9c515edce3c7a438b3bc308b30d3857.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - true, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::fp16_t, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9385db12001110c42eff6aabad935a69ad3afe2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9385db12001110c42eff6aabad935a69ad3afe2.hip deleted file mode 100644 index 1e0d64fb54a2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9385db12001110c42eff6aabad935a69ad3afe2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9559dd36a0a4f5e068a722e285f485137bd5ef0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9559dd36a0a4f5e068a722e285f485137bd5ef0.hip deleted file mode 100644 index 176a67e35c82..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9559dd36a0a4f5e068a722e285f485137bd5ef0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = 
k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9627f9c8d0088df0364a64643f2b5dcd951f2bb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9627f9c8d0088df0364a64643f2b5dcd951f2bb.hip deleted file mode 100644 index 987ed2a7e99f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9627f9c8d0088df0364a64643f2b5dcd951f2bb.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9a742ceeb6736a2c8f9439d0b05e10d3e0c5c6f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9a742ceeb6736a2c8f9439d0b05e10d3e0c5c6f.hip deleted file mode 100644 index 69fda307714b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9a742ceeb6736a2c8f9439d0b05e10d3e0c5c6f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, 
blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9baf70220079e6d4e87eb01a7259923d8a01e29.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9baf70220079e6d4e87eb01a7259923d8a01e29.hip deleted file mode 100644 index d3507927033c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9baf70220079e6d4e87eb01a7259923d8a01e29.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - 
fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9d00ab8373747a5c6b9d2f8dd50ceb14db4163c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9d00ab8373747a5c6b9d2f8dd50ceb14db4163c.hip deleted file mode 100644 index eaa319d9dede..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9d00ab8373747a5c6b9d2f8dd50ceb14db4163c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9ed0a64deb55616646ea98b21a891c971cd98ad.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9ed0a64deb55616646ea98b21a891c971cd98ad.hip deleted file mode 100644 index f189b923c12b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_b9ed0a64deb55616646ea98b21a891c971cd98ad.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ba145535e53899fe127987aa854f81234a9c51c4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ba145535e53899fe127987aa854f81234a9c51c4.hip deleted file mode 100644 index a118777df1c0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ba145535e53899fe127987aa854f81234a9c51c4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ba8b09f0aaa40a7c9ad5f0458b460d3e328f3c74.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ba8b09f0aaa40a7c9ad5f0458b460d3e328f3c74.hip deleted file mode 100644 index c3c11ed810a1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ba8b09f0aaa40a7c9ad5f0458b460d3e328f3c74.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bafbef3f13d429ec3e9f4672218998d5669d79f2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bafbef3f13d429ec3e9f4672218998d5669d79f2.hip deleted file mode 100644 index 4a7bd2784662..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bafbef3f13d429ec3e9f4672218998d5669d79f2.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bb111b7acc269f8d5e70915d3efde4c425aa5f5c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bb111b7acc269f8d5e70915d3efde4c425aa5f5c.hip deleted file mode 100644 index 914c23091d13..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bb111b7acc269f8d5e70915d3efde4c425aa5f5c.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bb28a4e95723e3df380f98b5ac107c4df353850b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bb28a4e95723e3df380f98b5ac107c4df353850b.hip deleted file mode 100644 index ab5a9eb00d58..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bb28a4e95723e3df380f98b5ac107c4df353850b.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bb35c86443cc9ea38c06ebc0656306483c95ef67.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bb35c86443cc9ea38c06ebc0656306483c95ef67.hip deleted file mode 100644 index ee2462813c7b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bb35c86443cc9ea38c06ebc0656306483c95ef67.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bba10ecb79ede07324e1198a71a95ff26e9eb235.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bba10ecb79ede07324e1198a71a95ff26e9eb235.hip deleted file mode 100644 index 46ff62a91264..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bba10ecb79ede07324e1198a71a95ff26e9eb235.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bbe23201fbebed25781f249e5c77c31e0e7f9ddb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bbe23201fbebed25781f249e5c77c31e0e7f9ddb.hip deleted file mode 100644 index 5c43fe594015..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bbe23201fbebed25781f249e5c77c31e0e7f9ddb.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bbfd025488e52b97c04995c4c5faff371b77e4d6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bbfd025488e52b97c04995c4c5faff371b77e4d6.hip deleted file mode 100644 index a03a3986cd5e..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bbfd025488e52b97c04995c4c5faff371b77e4d6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr 
ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc1ae1dddb8cc5d78196da6b26ebe66c1ce7e567.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc1ae1dddb8cc5d78196da6b26ebe66c1ce7e567.hip deleted file mode 100644 index 3d4e06bd8c45..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc1ae1dddb8cc5d78196da6b26ebe66c1ce7e567.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ 
> 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc238fd2095b26a167b41cdec8280182330b7b25.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc238fd2095b26a167b41cdec8280182330b7b25.hip deleted file mode 100644 index 76feafe68d3b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc238fd2095b26a167b41cdec8280182330b7b25.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t 
kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc4425e30a0b17e8b31726817e8d3177b5c51934.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc4425e30a0b17e8b31726817e8d3177b5c51934.hip deleted file mode 100644 index 14f0af66653c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc4425e30a0b17e8b31726817e8d3177b5c51934.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc4e0f0496a34d2fb43c80ce0162ad4183f29064.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc4e0f0496a34d2fb43c80ce0162ad4183f29064.hip deleted file mode 100644 index c371a0edef8e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc4e0f0496a34d2fb43c80ce0162ad4183f29064.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc6ce17223d8d83a64b8c96ac88223e4441a4692.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc6ce17223d8d83a64b8c96ac88223e4441a4692.hip deleted file mode 100644 index 08f3f0d96dd2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc6ce17223d8d83a64b8c96ac88223e4441a4692.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc744db85d4237ee9640f1658e0caab7648e3bb6.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc744db85d4237ee9640f1658e0caab7648e3bb6.hip deleted file mode 100644 index d7de39eb8832..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc744db85d4237ee9640f1658e0caab7648e3bb6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc79e255d25744725e2a9db9f90d5cc2b8a0e0c1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc79e255d25744725e2a9db9f90d5cc2b8a0e0c1.hip deleted file mode 100644 index 3e3fbb173f3d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc79e255d25744725e2a9db9f90d5cc2b8a0e0c1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc897852a4ca992961843144f4ec4f8b86dd5e9d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc897852a4ca992961843144f4ec4f8b86dd5e9d.hip deleted file mode 100644 index 0a0ee3dfa2ad..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bc897852a4ca992961843144f4ec4f8b86dd5e9d.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bcb6f0730fd09b4c6c60913425927dfdb8f83d82.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bcb6f0730fd09b4c6c60913425927dfdb8f83d82.hip deleted file mode 100644 index 752a14102de5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bcb6f0730fd09b4c6c60913425927dfdb8f83d82.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - 
std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bcd7ccdceb7baf3b986f2a0248827822a5f72e47.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bcd7ccdceb7baf3b986f2a0248827822a5f72e47.hip deleted file mode 100644 index 42deef39d3b9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bcd7ccdceb7baf3b986f2a0248827822a5f72e47.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bcf8836c8cf932cc2748e313885003f0e11a887f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bcf8836c8cf932cc2748e313885003f0e11a887f.hip deleted file mode 100644 index f6540f5fd184..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bcf8836c8cf932cc2748e313885003f0e11a887f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd064e302ff5b983dbdb4ccf51383fb29ddff44f.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd064e302ff5b983dbdb4ccf51383fb29ddff44f.hip deleted file mode 100644 index 2e015b44dfb6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd064e302ff5b983dbdb4ccf51383fb29ddff44f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd28203f47b6a48e9b66302cf8312f3796ca500c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd28203f47b6a48e9b66302cf8312f3796ca500c.hip deleted file mode 100644 index 128e3a8c8141..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd28203f47b6a48e9b66302cf8312f3796ca500c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd37f4f7914805a97d5073f1ebf8a8b8c2648d31.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd37f4f7914805a97d5073f1ebf8a8b8c2648d31.hip deleted file mode 100644 index d9289c3ec2d0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd37f4f7914805a97d5073f1ebf8a8b8c2648d31.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd3daa5f99b4522d932334924347353ce2854821.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd3daa5f99b4522d932334924347353ce2854821.hip deleted file mode 100644 index 5ae1b0d197c8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd3daa5f99b4522d932334924347353ce2854821.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd6aa39d0ae3c87d011610cdb5e2e317f337c454.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd6aa39d0ae3c87d011610cdb5e2e317f337c454.hip deleted file mode 100644 index 11e91f97f28e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd6aa39d0ae3c87d011610cdb5e2e317f337c454.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd80a1774d8b7d8bee4e8663392b97cda11dcbf5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd80a1774d8b7d8bee4e8663392b97cda11dcbf5.hip deleted file mode 100644 index 4bacc1bfcc44..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd80a1774d8b7d8bee4e8663392b97cda11dcbf5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd8bf7c572c1984ca3061062cf3c31d993f6762d.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd8bf7c572c1984ca3061062cf3c31d993f6762d.hip deleted file mode 100644 index e28aa30d4171..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd8bf7c572c1984ca3061062cf3c31d993f6762d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd9c47f3305e47db6ab6bc627fb3d80269633074.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd9c47f3305e47db6ab6bc627fb3d80269633074.hip deleted file mode 100644 index 33e0067cef0b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bd9c47f3305e47db6ab6bc627fb3d80269633074.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bdab172627718278a71a93e3737ef08ad9259a4f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bdab172627718278a71a93e3737ef08ad9259a4f.hip deleted file mode 100644 index d37b4f45f384..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bdab172627718278a71a93e3737ef08ad9259a4f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bde24a8dbe6add6f2dd2beb48b1280f3a84a9b2a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bde24a8dbe6add6f2dd2beb48b1280f3a84a9b2a.hip deleted file mode 100644 index cda1c331b7b2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bde24a8dbe6add6f2dd2beb48b1280f3a84a9b2a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_be1e1533fc37b41838bd37edc2b6d2f2e76ae1c6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_be1e1533fc37b41838bd37edc2b6d2f2e76ae1c6.hip deleted file mode 100644 index 6ec17e72b798..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_be1e1533fc37b41838bd37edc2b6d2f2e76ae1c6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, 
blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_be4dd90ccb2f258029d0156cf23f940b694cf08d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_be4dd90ccb2f258029d0156cf23f940b694cf08d.hip deleted file mode 100644 index b393c952b0ee..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_be4dd90ccb2f258029d0156cf23f940b694cf08d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - 
fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_be8ec1163a01b9cd9a802d8b44669e8770c20234.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_be8ec1163a01b9cd9a802d8b44669e8770c20234.hip deleted file mode 100644 index 52ff8d83f82b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_be8ec1163a01b9cd9a802d8b44669e8770c20234.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_beae876d6da465687f162136231f15767cc7bb14.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_beae876d6da465687f162136231f15767cc7bb14.hip deleted file mode 100644 index 2b5c01be4bef..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_beae876d6da465687f162136231f15767cc7bb14.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_beb9afccc15de7dfcb2e7d898abc0d61201de73e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_beb9afccc15de7dfcb2e7d898abc0d61201de73e.hip deleted file mode 100644 index 5e1a6311a89a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_beb9afccc15de7dfcb2e7d898abc0d61201de73e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bec30e7107c5dce3fe6aa87d83ed96da75478da0.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bec30e7107c5dce3fe6aa87d83ed96da75478da0.hip deleted file mode 100644 index 767ffc85d953..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bec30e7107c5dce3fe6aa87d83ed96da75478da0.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - true, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::bf16_t, - true, - true, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bec9e4c0317e8d351f60258ed6611fbf365c4024.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bec9e4c0317e8d351f60258ed6611fbf365c4024.hip deleted file mode 100644 index 008d0a0d6e25..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bec9e4c0317e8d351f60258ed6611fbf365c4024.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_becc2a4d7ac045365300bf8bd45fc6d3e1e1c8b1.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_becc2a4d7ac045365300bf8bd45fc6d3e1e1c8b1.hip deleted file mode 100644 index 8d64350df6d0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_becc2a4d7ac045365300bf8bd45fc6d3e1e1c8b1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bed5a8c5cf683f6dfaefad72c2e2f5c2f2b2732f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bed5a8c5cf683f6dfaefad72c2e2f5c2f2b2732f.hip deleted file mode 100644 index 9076df635999..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bed5a8c5cf683f6dfaefad72c2e2f5c2f2b2732f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - 
false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bef3bd014a918feddadc98eed92a7734f9bcd890.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bef3bd014a918feddadc98eed92a7734f9bcd890.hip deleted file mode 100644 index 33daac294c8a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bef3bd014a918feddadc98eed92a7734f9bcd890.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bf9cdf86a7944cd690b0fcbbaec235863acd10bb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bf9cdf86a7944cd690b0fcbbaec235863acd10bb.hip deleted file mode 100644 index 191e41a09b8c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_bf9cdf86a7944cd690b0fcbbaec235863acd10bb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c0338fbc05f86270ded7df2bd3e2758a03961b62.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c0338fbc05f86270ded7df2bd3e2758a03961b62.hip deleted file mode 100644 index 602016ba584a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c0338fbc05f86270ded7df2bd3e2758a03961b62.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c0342686e4efd26413c6719782ed13603479c4e0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c0342686e4efd26413c6719782ed13603479c4e0.hip deleted file mode 100644 index bc752acb627f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c0342686e4efd26413c6719782ed13603479c4e0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; 
- -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c063318cb851ccaa923be12d34c84d839bc64bb8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c063318cb851ccaa923be12d34c84d839bc64bb8.hip deleted file mode 100644 index 394bfbd91fd9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c063318cb851ccaa923be12d34c84d839bc64bb8.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c08095341ca7e3a1debeb780c1878e351692bee2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c08095341ca7e3a1debeb780c1878e351692bee2.hip deleted file mode 100644 index 70eae96b196d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c08095341ca7e3a1debeb780c1878e351692bee2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - 
std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c0a3c4ac0a50bb9b7ad764929dbee98c856b1210.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c0a3c4ac0a50bb9b7ad764929dbee98c856b1210.hip deleted file mode 100644 index 6b46e8921ccf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c0a3c4ac0a50bb9b7ad764929dbee98c856b1210.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c0f76aff077c28f8afd7b22f284cf2894e08a043.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c0f76aff077c28f8afd7b22f284cf2894e08a043.hip deleted file mode 100644 index 27417e41b7d4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c0f76aff077c28f8afd7b22f284cf2894e08a043.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::fp16_t, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c112c01d201c366bdd7acccf2e1b18b00f671153.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c112c01d201c366bdd7acccf2e1b18b00f671153.hip deleted file mode 100644 index 0f5c8d6e4adb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c112c01d201c366bdd7acccf2e1b18b00f671153.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c11d68fe766fc753c657362673704005b538660b.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c11d68fe766fc753c657362673704005b538660b.hip deleted file mode 100644 index 60f2bf6159e7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c11d68fe766fc753c657362673704005b538660b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c137c03bf161b2ec6a9a046fa49d7bbf80ae47b8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c137c03bf161b2ec6a9a046fa49d7bbf80ae47b8.hip deleted file mode 100644 index c48443e51c40..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c137c03bf161b2ec6a9a046fa49d7bbf80ae47b8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - 
-template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c197d1f050f42d82e6851fa286db6f81ba197f40.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c197d1f050f42d82e6851fa286db6f81ba197f40.hip deleted file mode 100644 index 13a9d77fef10..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c197d1f050f42d82e6851fa286db6f81ba197f40.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - 
typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c1b76bc7a17f573c0d52c07ae9ff4302662ae61f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c1b76bc7a17f573c0d52c07ae9ff4302662ae61f.hip deleted file mode 100644 index 8b8b7a433ac2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c1b76bc7a17f573c0d52c07ae9ff4302662ae61f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c1b94e19d762ddc33cc4e94c6675d93cbde21e3d.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c1b94e19d762ddc33cc4e94c6675d93cbde21e3d.hip deleted file mode 100644 index be3611b8829b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c1b94e19d762ddc33cc4e94c6675d93cbde21e3d.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c1f40c3421b9ad8cf43940530ec50bcf620058f2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c1f40c3421b9ad8cf43940530ec50bcf620058f2.hip deleted file mode 100644 index 3dd173a4a495..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c1f40c3421b9ad8cf43940530ec50bcf620058f2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c1f721a330b2d0fac13b22061616d7b10c0f91e9.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c1f721a330b2d0fac13b22061616d7b10c0f91e9.hip deleted file mode 100644 index 1091b95947f5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c1f721a330b2d0fac13b22061616d7b10c0f91e9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c250ea59ab6e1ee39cce15cbd3f181047cdee31a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c250ea59ab6e1ee39cce15cbd3f181047cdee31a.hip deleted file mode 100644 index 8e8f17dae6af..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c250ea59ab6e1ee39cce15cbd3f181047cdee31a.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2541b6b5cf27de3f45f60671d36602f07ce1783.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2541b6b5cf27de3f45f60671d36602f07ce1783.hip deleted file mode 100644 index 42638c0f3ad6..000000000000 
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2541b6b5cf27de3f45f60671d36602f07ce1783.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = 
k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c27b3026f1dc3056dee3a3e64bf31c45683607c9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c27b3026f1dc3056dee3a3e64bf31c45683607c9.hip deleted file mode 100644 index 913d62130eb9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c27b3026f1dc3056dee3a3e64bf31c45683607c9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - 
if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c28de8f96c8315877031a2d56261e95fee6aef44.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c28de8f96c8315877031a2d56261e95fee6aef44.hip deleted file mode 100644 index c57a9de06d86..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c28de8f96c8315877031a2d56261e95fee6aef44.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t 
kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c29110dd501853e87ebc122dd1971b0bb1bcd92f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c29110dd501853e87ebc122dd1971b0bb1bcd92f.hip deleted file mode 100644 index 87a12c06c1d6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c29110dd501853e87ebc122dd1971b0bb1bcd92f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - 
constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2940fd05efd52bdf8a3f9aa4b78bde9b5809b34.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2940fd05efd52bdf8a3f9aa4b78bde9b5809b34.hip deleted file mode 100644 index 80bfad3347f6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2940fd05efd52bdf8a3f9aa4b78bde9b5809b34.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using 
dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2a2856bf9a81544a30d535a13554e3a8107c476.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2a2856bf9a81544a30d535a13554e3a8107c476.hip deleted file mode 100644 index 456dd8659aa3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2a2856bf9a81544a30d535a13554e3a8107c476.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2b719893a4d8a1e71857966d399f06c0a41749c.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2b719893a4d8a1e71857966d399f06c0a41749c.hip deleted file mode 100644 index 236caac1bde5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2b719893a4d8a1e71857966d399f06c0a41749c.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2f04447e6a94c94a2315454e71d7d607a9fd0f8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2f04447e6a94c94a2315454e71d7d607a9fd0f8.hip deleted file mode 100644 index f141d2d50ecf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2f04447e6a94c94a2315454e71d7d607a9fd0f8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2fcced07cc194a8050bc7b2f791453b3f5b2064.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2fcced07cc194a8050bc7b2f791453b3f5b2064.hip deleted file mode 100644 index 1782df95df2e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c2fcced07cc194a8050bc7b2f791453b3f5b2064.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c323a4d1f24d59bddd20ed2f2fb6446627b0ae8b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c323a4d1f24d59bddd20ed2f2fb6446627b0ae8b.hip deleted file mode 100644 index 49da9084941d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c323a4d1f24d59bddd20ed2f2fb6446627b0ae8b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c355189ade9b1a8269230232db754a3881b53168.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c355189ade9b1a8269230232db754a3881b53168.hip deleted file mode 100644 index 7445c3af504c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c355189ade9b1a8269230232db754a3881b53168.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c35ea54eb6cd0f3756c462c66d9be956279b46ad.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c35ea54eb6cd0f3756c462c66d9be956279b46ad.hip deleted file mode 100644 index ce40ce2f6da8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c35ea54eb6cd0f3756c462c66d9be956279b46ad.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c363ee1b087f6b504a3dd3972b96e77db02b0582.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c363ee1b087f6b504a3dd3972b96e77db02b0582.hip deleted file mode 100644 index f83da199a20c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c363ee1b087f6b504a3dd3972b96e77db02b0582.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, 
ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c3cfaf0d53869c373f6d0ec821b008dbb819141a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c3cfaf0d53869c373f6d0ec821b008dbb819141a.hip deleted file mode 100644 index 5cb2f2c3aac3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c3cfaf0d53869c373f6d0ec821b008dbb819141a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c3d0eaf9399c863d672e8c08d123739bab837d4b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c3d0eaf9399c863d672e8c08d123739bab837d4b.hip deleted file mode 100644 index f8badbef3afe..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c3d0eaf9399c863d672e8c08d123739bab837d4b.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4015f0d0a7a5173810f6f17c00065e03fc61a89.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4015f0d0a7a5173810f6f17c00065e03fc61a89.hip deleted file mode 100644 index 6295804a8f97..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4015f0d0a7a5173810f6f17c00065e03fc61a89.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c402e84359b2037a29efd1d6ce7213ba7605ab25.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c402e84359b2037a29efd1d6ce7213ba7605ab25.hip deleted file mode 100644 index f2617a099b88..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c402e84359b2037a29efd1d6ce7213ba7605ab25.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c41b6eda4f250da059fe0c428428219ff5a250ef.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c41b6eda4f250da059fe0c428428219ff5a250ef.hip deleted file mode 100644 index 16787715cd59..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c41b6eda4f250da059fe0c428428219ff5a250ef.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::fp16_t, - false, - false, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c42ab428503e8f8bfa78c8cb8d9afad9f5185118.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c42ab428503e8f8bfa78c8cb8d9afad9f5185118.hip deleted file mode 100644 index 7b27936874de..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c42ab428503e8f8bfa78c8cb8d9afad9f5185118.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4376ac8d82db1bc25fa273a80dfbf8b71ee5e2b.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4376ac8d82db1bc25fa273a80dfbf8b71ee5e2b.hip deleted file mode 100644 index 022a42e53b36..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4376ac8d82db1bc25fa273a80dfbf8b71ee5e2b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c45a5e40f6a66bc5292a56e0097c69fe37cedfb3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c45a5e40f6a66bc5292a56e0097c69fe37cedfb3.hip deleted file mode 100644 index 91d6fbd9347f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c45a5e40f6a66bc5292a56e0097c69fe37cedfb3.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::fp16_t, - false, - true, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c487a1a9933239270f44b1e08e1cf5323521c089.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c487a1a9933239270f44b1e08e1cf5323521c089.hip deleted file mode 100644 index bacc30d483ef..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c487a1a9933239270f44b1e08e1cf5323521c089.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4997f79435cf64add10506acb97d0647cfbb3d4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4997f79435cf64add10506acb97d0647cfbb3d4.hip deleted file mode 100644 index 0a6649fa1e74..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4997f79435cf64add10506acb97d0647cfbb3d4.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4b34d3cb673447773f6da23e9cf52b98e99f718.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4b34d3cb673447773f6da23e9cf52b98e99f718.hip deleted file mode 100644 index 8d51e201f81d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4b34d3cb673447773f6da23e9cf52b98e99f718.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. 
All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4c3425fe683d35dc3335db77d183ad1620b7a92.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4c3425fe683d35dc3335db77d183ad1620b7a92.hip deleted file mode 100644 index 98c92d20d8a0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4c3425fe683d35dc3335db77d183ad1620b7a92.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, 
ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4c6c405cefe204824e8fad1b3dd34bba87e796a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4c6c405cefe204824e8fad1b3dd34bba87e796a.hip deleted file mode 100644 index ee753848c646..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4c6c405cefe204824e8fad1b3dd34bba87e796a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4de1bc135191f3c2aff740f4c6bb7e98da42f84.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4de1bc135191f3c2aff740f4c6bb7e98da42f84.hip deleted file mode 100644 index 9d8e124a5c5f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4de1bc135191f3c2aff740f4c6bb7e98da42f84.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4dec99707511cebd9188d216ee0a148d729b470.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4dec99707511cebd9188d216ee0a148d729b470.hip deleted file mode 100644 index cfc5294375be..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c4dec99707511cebd9188d216ee0a148d729b470.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c538dc4f65d02776875627cbd20a9c794d70b043.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c538dc4f65d02776875627cbd20a9c794d70b043.hip deleted file mode 100644 index ea23fa40a6f4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c538dc4f65d02776875627cbd20a9c794d70b043.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c53e295b68e807774ed31bb914e4bc59312a77d7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c53e295b68e807774ed31bb914e4bc59312a77d7.hip deleted file mode 100644 index 0f2e34ea0bdb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c53e295b68e807774ed31bb914e4bc59312a77d7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c56aa150611b0d4800470c1493dc907082a5c23f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c56aa150611b0d4800470c1493dc907082a5c23f.hip deleted file mode 100644 index a9e3d97b44a8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c56aa150611b0d4800470c1493dc907082a5c23f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c581974c8b6f43f60d0af29c350d850b55c03121.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c581974c8b6f43f60d0af29c350d850b55c03121.hip deleted file mode 100644 index 529e8114dc5a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c581974c8b6f43f60d0af29c350d850b55c03121.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - true, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::bf16_t, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c59937be2b9a13d6520fdcc922e4e75c9fa085ab.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c59937be2b9a13d6520fdcc922e4e75c9fa085ab.hip deleted file mode 100644 index ae94ab9da8d2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c59937be2b9a13d6520fdcc922e4e75c9fa085ab.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c59a22c6efd8bb8815887325aa0b739e260cc754.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c59a22c6efd8bb8815887325aa0b739e260cc754.hip deleted file mode 100644 index 17e79e74eec3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c59a22c6efd8bb8815887325aa0b739e260cc754.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 
0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c59ab718fa23f24f09a713ac28a339208a7a5802.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c59ab718fa23f24f09a713ac28a339208a7a5802.hip deleted file mode 100644 index 905688519494..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c59ab718fa23f24f09a713ac28a339208a7a5802.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c5b440ca9a5196ee1e72c878c87d96934e9273c8.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c5b440ca9a5196ee1e72c878c87d96934e9273c8.hip deleted file mode 100644 index 3fc61faf14ba..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c5b440ca9a5196ee1e72c878c87d96934e9273c8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c5fcdea177734366d3bf283317a65cc3fffda611.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c5fcdea177734366d3bf283317a65cc3fffda611.hip deleted file mode 100644 index 513bb8eab599..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c5fcdea177734366d3bf283317a65cc3fffda611.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - 
false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c5fef330a975002ed15670e8e7b26a10376d3cb7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c5fef330a975002ed15670e8e7b26a10376d3cb7.hip deleted file mode 100644 index 5ae228165bd6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c5fef330a975002ed15670e8e7b26a10376d3cb7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c64f4cdce32189065362a502105c31bd2d9d99a4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c64f4cdce32189065362a502105c31bd2d9d99a4.hip deleted file mode 100644 index 40fde5dd421f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c64f4cdce32189065362a502105c31bd2d9d99a4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c6e2da8b791d31f4ba05ef5f833fd6dea9e35f1c.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c6e2da8b791d31f4ba05ef5f833fd6dea9e35f1c.hip deleted file mode 100644 index 131ecc94e538..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c6e2da8b791d31f4ba05ef5f833fd6dea9e35f1c.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::bf16_t, - false, - true, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c7568e11e44ce70924d27e683190422cfae5c31d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c7568e11e44ce70924d27e683190422cfae5c31d.hip deleted file mode 100644 index d09808107a78..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c7568e11e44ce70924d27e683190422cfae5c31d.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c7af2bbfac25de2853be344b9f636226c1c0112d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c7af2bbfac25de2853be344b9f636226c1c0112d.hip deleted file mode 100644 index f938c131727b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c7af2bbfac25de2853be344b9f636226c1c0112d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c806d7803d06ef8aac1d5caac9f36aafd47653d5.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c806d7803d06ef8aac1d5caac9f36aafd47653d5.hip deleted file mode 100644 index 94661ebbba9a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c806d7803d06ef8aac1d5caac9f36aafd47653d5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c80dce1a17d073259250ec0c87ade69e639ffa8e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c80dce1a17d073259250ec0c87ade69e639ffa8e.hip deleted file mode 100644 index cc0ee6ec6530..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c80dce1a17d073259250ec0c87ade69e639ffa8e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c8dbfaffc8a9b573f194f9c63f1175d9725f8950.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c8dbfaffc8a9b573f194f9c63f1175d9725f8950.hip deleted file mode 100644 index ca5e85e996fd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c8dbfaffc8a9b573f194f9c63f1175d9725f8950.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c8f6461673882d636772ae4d26e78eabcb568f31.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c8f6461673882d636772ae4d26e78eabcb568f31.hip deleted file mode 100644 index a2d5ba81161f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c8f6461673882d636772ae4d26e78eabcb568f31.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c919b8ed877d4244d01a17ecb948b459e361ff24.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c919b8ed877d4244d01a17ecb948b459e361ff24.hip deleted file mode 100644 index f0f9c3733cf6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c919b8ed877d4244d01a17ecb948b459e361ff24.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c921a4790f982d48bcaf950123c699647afb739b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c921a4790f982d48bcaf950123c699647afb739b.hip deleted file mode 100644 index dc436e44e37e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c921a4790f982d48bcaf950123c699647afb739b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9312d7159369d13f3148a6f0882dfad6921ceec.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9312d7159369d13f3148a6f0882dfad6921ceec.hip deleted file mode 100644 index b3256b0f5c6a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9312d7159369d13f3148a6f0882dfad6921ceec.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9530e20038eb40c49bc8b045be0cf4e7e6b4eac.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9530e20038eb40c49bc8b045be0cf4e7e6b4eac.hip deleted file mode 100644 index 2e96f767ae26..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9530e20038eb40c49bc8b045be0cf4e7e6b4eac.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c977735a36c325706bd19a12df66ed0839b032b1.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c977735a36c325706bd19a12df66ed0839b032b1.hip deleted file mode 100644 index 6d56b66518f3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c977735a36c325706bd19a12df66ed0839b032b1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9ad71883a19b522486706d3705700c012a6fc19.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9ad71883a19b522486706d3705700c012a6fc19.hip deleted file mode 100644 index 411be43aef39..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9ad71883a19b522486706d3705700c012a6fc19.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9ba0a3369d4e4eaea1c902a90e6501f232dd57c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9ba0a3369d4e4eaea1c902a90e6501f232dd57c.hip deleted file mode 100644 index 49a9b123990c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9ba0a3369d4e4eaea1c902a90e6501f232dd57c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9f1e7e478a2208c4d32e2d7e6abebdc16bcc5fe.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9f1e7e478a2208c4d32e2d7e6abebdc16bcc5fe.hip deleted file mode 100644 index cdd3aa0ddde3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9f1e7e478a2208c4d32e2d7e6abebdc16bcc5fe.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9f28230817c9d9805c41dfcd4e834fe302e1df1.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9f28230817c9d9805c41dfcd4e834fe302e1df1.hip deleted file mode 100644 index 0c350a844d3b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9f28230817c9d9805c41dfcd4e834fe302e1df1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9fb8343e623e46f01893a2b61345d1ca5928671.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9fb8343e623e46f01893a2b61345d1ca5928671.hip deleted file mode 100644 index 5f4eb47dab4d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9fb8343e623e46f01893a2b61345d1ca5928671.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; 
- -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9fe51f982abd60e567d4238d3266fb60e45814b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9fe51f982abd60e567d4238d3266fb60e45814b.hip deleted file mode 100644 index 012cc9795d4a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_c9fe51f982abd60e567d4238d3266fb60e45814b.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - true, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::bf16_t, - true, - true, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca00cfdc5592b7440d72482a18781e9cf3afb05a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca00cfdc5592b7440d72482a18781e9cf3afb05a.hip deleted file mode 100644 index 759d7de093d6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca00cfdc5592b7440d72482a18781e9cf3afb05a.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca1992a2634cd6674076611be54197c715ad8271.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca1992a2634cd6674076611be54197c715ad8271.hip deleted file mode 100644 index 81942aff4b81..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca1992a2634cd6674076611be54197c715ad8271.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca3975efd767ddf7c12e308d948bdcaf0968493a.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca3975efd767ddf7c12e308d948bdcaf0968493a.hip deleted file mode 100644 index eb4ddb914fc3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca3975efd767ddf7c12e308d948bdcaf0968493a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca3d98ff43fbb80ceb82fc22ab039bee898969b0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca3d98ff43fbb80ceb82fc22ab039bee898969b0.hip deleted file mode 100644 index 3a0c6d636142..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca3d98ff43fbb80ceb82fc22ab039bee898969b0.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca4c6ad28aff1976c6dd36974ec3b339aa3090e9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca4c6ad28aff1976c6dd36974ec3b339aa3090e9.hip deleted file mode 100644 index 
f3d182ec8061..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca4c6ad28aff1976c6dd36974ec3b339aa3090e9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = 
k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca5681d4e5871aacef74bdba9e368445875252d3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca5681d4e5871aacef74bdba9e368445875252d3.hip deleted file mode 100644 index 39af1d4f8e71..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca5681d4e5871aacef74bdba9e368445875252d3.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::bf16_t, - false, - false, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca920c3239bb5796b1ab2fc75177eb3b820aa784.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca920c3239bb5796b1ab2fc75177eb3b820aa784.hip deleted file mode 100644 index fb307c260c41..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ca920c3239bb5796b1ab2fc75177eb3b820aa784.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cabb7b12cdd9b8b522af577e13232b2459dbd38d.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cabb7b12cdd9b8b522af577e13232b2459dbd38d.hip deleted file mode 100644 index bfbacdddbd4e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cabb7b12cdd9b8b522af577e13232b2459dbd38d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cae6c7efbfc831e2bcfc8c1efa1a486c02627cbf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cae6c7efbfc831e2bcfc8c1efa1a486c02627cbf.hip deleted file mode 100644 index 4d85ee9b645e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cae6c7efbfc831e2bcfc8c1efa1a486c02627cbf.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_caede7a18f3e3d5e24f6c70392413a2cda16ac15.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_caede7a18f3e3d5e24f6c70392413a2cda16ac15.hip deleted file mode 100644 index 06f1e87193b5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_caede7a18f3e3d5e24f6c70392413a2cda16ac15.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb10303a0b79f2710eb7c66896d3c1f8b12c04dd.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb10303a0b79f2710eb7c66896d3c1f8b12c04dd.hip deleted file mode 100644 index 69efe0f023a6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb10303a0b79f2710eb7c66896d3c1f8b12c04dd.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::fp16_t, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb1a0ce432c27f4cfa51731c3ef181bf60c8a727.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb1a0ce432c27f4cfa51731c3ef181bf60c8a727.hip deleted file mode 100644 index b6809394551f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb1a0ce432c27f4cfa51731c3ef181bf60c8a727.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb1b91c16e0255fe7a0a85638b98d94634e143a9.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb1b91c16e0255fe7a0a85638b98d94634e143a9.hip deleted file mode 100644 index 36ded3caa160..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb1b91c16e0255fe7a0a85638b98d94634e143a9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb1deea4f4fab0db31d46a91228601f0c272d6e6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb1deea4f4fab0db31d46a91228601f0c272d6e6.hip deleted file mode 100644 index b5a8f4657859..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb1deea4f4fab0db31d46a91228601f0c272d6e6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - 
false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb20538073888bdb3174a8e9c32d7449072aa753.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb20538073888bdb3174a8e9c32d7449072aa753.hip deleted file mode 100644 index 9034069a0ee8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb20538073888bdb3174a8e9c32d7449072aa753.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb3d5273945c5d40cc05c2660af2df1fb7a15f3c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb3d5273945c5d40cc05c2660af2df1fb7a15f3c.hip deleted file mode 100644 index 704f387c313c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb3d5273945c5d40cc05c2660af2df1fb7a15f3c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb4576e8ea5d59d7663f3760009a00a19e1b0667.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb4576e8ea5d59d7663f3760009a00a19e1b0667.hip deleted file mode 100644 index 8ece9799f425..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cb4576e8ea5d59d7663f3760009a00a19e1b0667.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cbd571f4fe576fdb17d5f75a558cb6747087c7f2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cbd571f4fe576fdb17d5f75a558cb6747087c7f2.hip deleted file mode 100644 index 1ce00dac8164..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cbd571f4fe576fdb17d5f75a558cb6747087c7f2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, 
- false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cbe5a98163e878c7697e554758ebd0597c2c1760.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cbe5a98163e878c7697e554758ebd0597c2c1760.hip deleted file mode 100644 index 3b92dead1a98..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cbe5a98163e878c7697e554758ebd0597c2c1760.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cbf3e4d4d4837a0cb33b78c4f2767b1d93da0850.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cbf3e4d4d4837a0cb33b78c4f2767b1d93da0850.hip deleted file mode 100644 index 464b0174d1dd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cbf3e4d4d4837a0cb33b78c4f2767b1d93da0850.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cc127a63d56099e08125b16939dac82f0173122b.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cc127a63d56099e08125b16939dac82f0173122b.hip deleted file mode 100644 index 78939c36b2e1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cc127a63d56099e08125b16939dac82f0173122b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cc4ac5a18f57f2ebb65f7e356e858ab0d59b2133.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cc4ac5a18f57f2ebb65f7e356e858ab0d59b2133.hip deleted file mode 100644 index c2c61945ec38..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cc4ac5a18f57f2ebb65f7e356e858ab0d59b2133.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - true, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::bf16_t, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cc54b107e1b557ea36b5cbaf7fe3dfce05415c86.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cc54b107e1b557ea36b5cbaf7fe3dfce05415c86.hip deleted file mode 100644 index 7555947b0ba6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cc54b107e1b557ea36b5cbaf7fe3dfce05415c86.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ccac6c0e61b65c9422c7f30fbd979031698370a9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ccac6c0e61b65c9422c7f30fbd979031698370a9.hip deleted file mode 100644 index 34ae1f2682ca..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ccac6c0e61b65c9422c7f30fbd979031698370a9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return 
ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ccd0b777df1328bf24e070ed4cdf8615bb2199fe.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ccd0b777df1328bf24e070ed4cdf8615bb2199fe.hip deleted file mode 100644 index e134e6ea5168..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ccd0b777df1328bf24e070ed4cdf8615bb2199fe.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - 
true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cd0453a5c3828c1358360f31f5d3b7258e17fdb9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cd0453a5c3828c1358360f31f5d3b7258e17fdb9.hip deleted file mode 100644 index 435630ecd9dc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cd0453a5c3828c1358360f31f5d3b7258e17fdb9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cd4efcdd12184211c74e7b3f2f30fecf1041ca32.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cd4efcdd12184211c74e7b3f2f30fecf1041ca32.hip deleted file mode 100644 index 75a9e30603d2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cd4efcdd12184211c74e7b3f2f30fecf1041ca32.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, 
ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cd757a8bbeabd16a44d149ab188430f6d79ddcaf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cd757a8bbeabd16a44d149ab188430f6d79ddcaf.hip deleted file mode 100644 index 1933aed73476..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cd757a8bbeabd16a44d149ab188430f6d79ddcaf.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cde0582e1aef74f9209de638b553ec0671476258.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cde0582e1aef74f9209de638b553ec0671476258.hip deleted file mode 100644 index 198c46f1f6c7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cde0582e1aef74f9209de638b553ec0671476258.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce4714e4f33340859c106a3129993e22652262e2.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce4714e4f33340859c106a3129993e22652262e2.hip deleted file mode 100644 index 4de81137449b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce4714e4f33340859c106a3129993e22652262e2.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce5064e27ba427cb951f7e1b01328b0beb6b2b7c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce5064e27ba427cb951f7e1b01328b0beb6b2b7c.hip deleted file mode 100644 index 667194c23742..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce5064e27ba427cb951f7e1b01328b0beb6b2b7c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce5ad502dd40353312d561e9f40aa478c16ef5b1.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce5ad502dd40353312d561e9f40aa478c16ef5b1.hip deleted file mode 100644 index 7289ddad2559..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce5ad502dd40353312d561e9f40aa478c16ef5b1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce5b5932f6df9a194ceb0d69220fba9596528eec.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce5b5932f6df9a194ceb0d69220fba9596528eec.hip deleted file mode 100644 index 7b0e36bfc9de..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce5b5932f6df9a194ceb0d69220fba9596528eec.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - 
-template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce5c161b725becf059fb4439c668edd454ac77d1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce5c161b725becf059fb4439c668edd454ac77d1.hip deleted file mode 100644 index c595a779cdea..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce5c161b725becf059fb4439c668edd454ac77d1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - 
typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce909cb5f96a4884caa0d2eb8c5e6bc7fa352797.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce909cb5f96a4884caa0d2eb8c5e6bc7fa352797.hip deleted file mode 100644 index 16d62af31743..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ce909cb5f96a4884caa0d2eb8c5e6bc7fa352797.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ceb9544e2a0caae2c9e3dd8bbd2c509e8dca1379.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ceb9544e2a0caae2c9e3dd8bbd2c509e8dca1379.hip deleted file mode 100644 index 2e46e916aac9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ceb9544e2a0caae2c9e3dd8bbd2c509e8dca1379.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cee81ab2e2678816c7b516d2d4c50e8cb5874c68.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cee81ab2e2678816c7b516d2d4c50e8cb5874c68.hip deleted file mode 100644 index b587145a630d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cee81ab2e2678816c7b516d2d4c50e8cb5874c68.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cf5c6c0bfaf98f6e655fc443246b81fcc730fe97.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cf5c6c0bfaf98f6e655fc443246b81fcc730fe97.hip deleted file mode 100644 index 306d9691df38..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cf5c6c0bfaf98f6e655fc443246b81fcc730fe97.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cf73e1fc0015094861ca0c1c81bacdbe0c5b8f37.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cf73e1fc0015094861ca0c1c81bacdbe0c5b8f37.hip deleted file mode 100644 index ccbb0403a4cd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cf73e1fc0015094861ca0c1c81bacdbe0c5b8f37.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - 
-template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cfda56a4eb08b803332f25bda6209932d9624acc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cfda56a4eb08b803332f25bda6209932d9624acc.hip deleted file mode 100644 index 9d8de36c2388..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cfda56a4eb08b803332f25bda6209932d9624acc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - 
typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cfec97bdfb6fa95e057eaf5a8138853e1c0884f2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cfec97bdfb6fa95e057eaf5a8138853e1c0884f2.hip deleted file mode 100644 index 4975d2a08b39..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_cfec97bdfb6fa95e057eaf5a8138853e1c0884f2.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d00f65bc99ca08eba66564d34f72f2769bff9491.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d00f65bc99ca08eba66564d34f72f2769bff9491.hip deleted file mode 100644 index 153be1294071..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d00f65bc99ca08eba66564d34f72f2769bff9491.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d036096f49a89730f8af7e75457c88cb8ae64165.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d036096f49a89730f8af7e75457c88cb8ae64165.hip deleted file mode 100644 index f9e1d3a26fa1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d036096f49a89730f8af7e75457c88cb8ae64165.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d049a1b8f4c1c6d37973ce38593efda1de8ce0cd.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d049a1b8f4c1c6d37973ce38593efda1de8ce0cd.hip deleted file mode 100644 index 0333b3fbd6d6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d049a1b8f4c1c6d37973ce38593efda1de8ce0cd.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d04dc4ed02eb42c3fe303342801ed3073a0dcb8e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d04dc4ed02eb42c3fe303342801ed3073a0dcb8e.hip deleted file mode 100644 index f4bbd80c1a8c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d04dc4ed02eb42c3fe303342801ed3073a0dcb8e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d06ba4c996570ddab77b6ff1e2a0101b638543eb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d06ba4c996570ddab77b6ff1e2a0101b638543eb.hip deleted file mode 100644 index c1041812f460..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d06ba4c996570ddab77b6ff1e2a0101b638543eb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0863830fc5d43dc6d6400280e892bb7de2892d4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0863830fc5d43dc6d6400280e892bb7de2892d4.hip deleted file mode 100644 index 980649f7afd2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0863830fc5d43dc6d6400280e892bb7de2892d4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d090b771a4f9750132f549c82a88b4ab00dce5c7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d090b771a4f9750132f549c82a88b4ab00dce5c7.hip
deleted file mode 100644
index 6af910f2d430..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d090b771a4f9750132f549c82a88b4ab00dce5c7.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[... 138 deleted lines: autogenerated CK-tile FMHA backward (dQ/dK/dV) kernel instantiation; fp16, head dim 256, KRKTRVR pipeline, no bias ...]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0b09e8513646fbb2a007544a63ec9e2b04dc4c2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0b09e8513646fbb2a007544a63ec9e2b04dc4c2.hip
deleted file mode 100644
index 09a6a7583fd9..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0b09e8513646fbb2a007544a63ec9e2b04dc4c2.hip
+++ /dev/null
@@ -1,80 +0,0 @@
[... 80 deleted lines: autogenerated CK-tile FMHA forward kernel instantiation; bf16, head dim 256, QRKSVS pipeline, no bias ...]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0daa59f5dce6fc3965193ae37d8c82a3d1834e6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0daa59f5dce6fc3965193ae37d8c82a3d1834e6.hip
deleted file mode 100644
index aa1a8da5be92..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0daa59f5dce6fc3965193ae37d8c82a3d1834e6.hip
+++ /dev/null
@@ -1,80 +0,0 @@
[... 80 deleted lines: autogenerated CK-tile FMHA forward kernel instantiation; fp16, head dim 64, QRKSVS_ASYNC pipeline, no bias ...]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0dd0165ee91c095a19ceddf08789e3576912590.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0dd0165ee91c095a19ceddf08789e3576912590.hip
deleted file mode 100644
index 01a5d42cc233..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0dd0165ee91c095a19ceddf08789e3576912590.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[... 138 deleted lines: autogenerated CK-tile FMHA backward (dQ/dK/dV) kernel instantiation; bf16, head dim 128, KRKTRVR pipeline, ALIBI bias ...]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0de618ff3ea9f67b90f2227fb7fcc74ea34183d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0de618ff3ea9f67b90f2227fb7fcc74ea34183d.hip
deleted file mode 100644
index 976954f04e90..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0de618ff3ea9f67b90f2227fb7fcc74ea34183d.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[... 138 deleted lines: autogenerated CK-tile FMHA backward (dQ/dK/dV) kernel instantiation; bf16, head dim 64, KRKTRVR_IGLP pipeline, no bias ...]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0f63cafbeb445408c884727b473667fb479675e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0f63cafbeb445408c884727b473667fb479675e.hip
deleted file mode 100644
index e8c968a70cae..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d0f63cafbeb445408c884727b473667fb479675e.hip
+++ /dev/null
@@ -1,80 +0,0 @@
[... 80 deleted lines: autogenerated CK-tile FMHA forward kernel instantiation; bf16, head dim 128, QRKSVS_ASYNC pipeline, no bias ...]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d137b7b6e04e1caf43a62bd6788a75361cfa98f6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d137b7b6e04e1caf43a62bd6788a75361cfa98f6.hip
deleted file mode 100644
index b8a62d801652..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d137b7b6e04e1caf43a62bd6788a75361cfa98f6.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[... 138 deleted lines: autogenerated CK-tile FMHA backward (dQ/dK/dV) kernel instantiation; fp16, head dim 64, KRKTRVR pipeline, no bias ...]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d1840494c4fa78ff399c0399b3ad7ca3d22d4587.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d1840494c4fa78ff399c0399b3ad7ca3d22d4587.hip
deleted file mode 100644
index 424d1718b8db..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d1840494c4fa78ff399c0399b3ad7ca3d22d4587.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[... 138 deleted lines: autogenerated CK-tile FMHA backward (dQ/dK/dV) kernel instantiation; bf16, head dim 128, KRKTRVR pipeline, no bias ...]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d18727988e47264b42b4153dc82fc1a750f08db0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d18727988e47264b42b4153dc82fc1a750f08db0.hip
deleted file mode 100644
index 8bda0c0c3e3d..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d18727988e47264b42b4153dc82fc1a750f08db0.hip
+++ /dev/null
@@ -1,73 +0,0 @@
[... 73 deleted lines: autogenerated CK-tile FMHA dQ (QGrad) conversion kernel instantiation; fp16, head dim 256 ...]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d1c0dfd19a08d61586758091370acbdc6f267017.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d1c0dfd19a08d61586758091370acbdc6f267017.hip
deleted file mode 100644
index af5115b8125e..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d1c0dfd19a08d61586758091370acbdc6f267017.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[... 138 deleted lines: autogenerated CK-tile FMHA backward (dQ/dK/dV) kernel instantiation; bf16, head dim 128, KRKTRVR pipeline, no bias ...]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d1c25cfc437d8bd803860e39a45b2f3b9fa48393.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d1c25cfc437d8bd803860e39a45b2f3b9fa48393.hip
deleted file mode 100644
index c9eb1a48c620..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d1c25cfc437d8bd803860e39a45b2f3b9fa48393.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[... 138 deleted lines: autogenerated CK-tile FMHA backward (dQ/dK/dV) kernel instantiation; fp16, head dim 64, KRKTRVR pipeline, ALIBI bias ...]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d1d3eacc320104100bce46235fe656e5a8223c66.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d1d3eacc320104100bce46235fe656e5a8223c66.hip
deleted file mode 100644
index 7fd064e61755..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d1d3eacc320104100bce46235fe656e5a8223c66.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[... 138 deleted lines: autogenerated CK-tile FMHA backward (dQ/dK/dV) kernel instantiation; bf16, head dim 32, KRKTRVR pipeline, ALIBI bias ...]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d20d45aa85c0daa299da98c277cee826fe67bd27.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d20d45aa85c0daa299da98c277cee826fe67bd27.hip
deleted file mode 100644
index 6c7e14a3772c..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d20d45aa85c0daa299da98c277cee826fe67bd27.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[... 138 deleted lines: autogenerated CK-tile FMHA backward (dQ/dK/dV) kernel instantiation; fp16, head dim 256, KRKTRVR_IGLP pipeline, ALIBI bias ...]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d257148f457557ea80ca56690e525db3a4b0ff55.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d257148f457557ea80ca56690e525db3a4b0ff55.hip
deleted file mode 100644
index 4b9b9aa6d8ac..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d257148f457557ea80ca56690e525db3a4b0ff55.hip
+++ /dev/null
@@ -1,73 +0,0 @@
[... 73 deleted lines: autogenerated CK-tile FMHA dQ (QGrad) conversion kernel instantiation; fp16, head dim 256 ...]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d25ce4b3e9cc392ceafebc7fe3bcbe05aaad4bbc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d25ce4b3e9cc392ceafebc7fe3bcbe05aaad4bbc.hip
deleted file mode 100644
index 517bebd1cc9a..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d25ce4b3e9cc392ceafebc7fe3bcbe05aaad4bbc.hip
+++ /dev/null
@@ -1,138 +0,0 @@
[... 138 deleted lines: autogenerated CK-tile FMHA backward (dQ/dK/dV) kernel instantiation; fp16, head dim 256, KRKTRVR_IGLP pipeline, no bias ...]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d2d08c5470a385d0160b2c1441fd1c30fff1c17c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d2d08c5470a385d0160b2c1441fd1c30fff1c17c.hip
deleted file mode 100644
index 9fff84645693..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d2d08c5470a385d0160b2c1441fd1c30fff1c17c.hip
+++ /dev/null
@@ -1,80 +0,0 @@
[... 80 deleted lines: autogenerated CK-tile FMHA forward kernel instantiation; bf16, head dim 64, QRKSVS_ASYNC pipeline, ALIBI bias ...]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d2daccc4b3a0f90bff39cb4597f8b7e484613d9e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d2daccc4b3a0f90bff39cb4597f8b7e484613d9e.hip
deleted file mode 100644
index c5f1fd582aa8..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d2daccc4b3a0f90bff39cb4597f8b7e484613d9e.hip
+++ /dev/null
@@ -1,80 +0,0 @@
[... 80 deleted lines: autogenerated CK-tile FMHA forward kernel instantiation; bf16, head dim 64, QRKSVS_ASYNC pipeline, ALIBI bias ...]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d2dfdb42c1b380e860aa5609302f29698dd27923.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d2dfdb42c1b380e860aa5609302f29698dd27923.hip
deleted file mode 100644
index 94ed9dd28798..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d2dfdb42c1b380e860aa5609302f29698dd27923.hip
+++ /dev/null
@@ -1,80 +0,0 @@
[... 80 deleted lines: autogenerated CK-tile FMHA forward kernel instantiation; bf16, head dim 64, QRKSVS_ASYNC pipeline, no bias ...]
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d2f4b869ff23874b6bde0aab68c419108b7e69f4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d2f4b869ff23874b6bde0aab68c419108b7e69f4.hip
deleted file mode 100644
index d3fdd90b305e..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d2f4b869ff23874b6bde0aab68c419108b7e69f4.hip
+++ /dev/null
@@ -1,138 +0,0 @@
-// ==========================================
-// THIS CODE IS AUTOGENERATED. DO NOT MODIFY.
-// @generated
-// ==========================================
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d32c64ef01aa228277d031a74df51363f98aa2b0.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d32c64ef01aa228277d031a74df51363f98aa2b0.hip deleted file mode 100644 index a0051666d9cc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d32c64ef01aa228277d031a74df51363f98aa2b0.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::bf16_t, - false, - false, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d34d6cdcd81a456125ab5e0875466c6334d8e5c8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d34d6cdcd81a456125ab5e0875466c6334d8e5c8.hip deleted file mode 100644 index e0ae2c571360..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d34d6cdcd81a456125ab5e0875466c6334d8e5c8.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d34fcb56caa8f80404789fba0ffac447483a4d84.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d34fcb56caa8f80404789fba0ffac447483a4d84.hip deleted file mode 100644 index bcbfb7d4ece1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d34fcb56caa8f80404789fba0ffac447483a4d84.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3784fb4c0685d7b651f4113f3c71e050881f3a5.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3784fb4c0685d7b651f4113f3c71e050881f3a5.hip deleted file mode 100644 index 5b5162d095f9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3784fb4c0685d7b651f4113f3c71e050881f3a5.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3a23ded424200d0c6f06b1dbd0a7b7b0e7b5d9b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3a23ded424200d0c6f06b1dbd0a7b7b0e7b5d9b.hip deleted file mode 100644 index 1c6be7bb35a2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3a23ded424200d0c6f06b1dbd0a7b7b0e7b5d9b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3a2edf232786d458e2125f8dfeda8847f842afa.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3a2edf232786d458e2125f8dfeda8847f842afa.hip deleted file mode 100644 index f5c3c75ac36b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3a2edf232786d458e2125f8dfeda8847f842afa.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3af8763f289dace1054bdcb4dfeda28b0aefcae.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3af8763f289dace1054bdcb4dfeda28b0aefcae.hip deleted file mode 100644 index e17ac519f030..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3af8763f289dace1054bdcb4dfeda28b0aefcae.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - true, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::fp16_t, - true, - true, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3fce1e11aee2273620e75efe4aa0390fcde9ba5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3fce1e11aee2273620e75efe4aa0390fcde9ba5.hip deleted file mode 100644 index 09ef59db3945..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d3fce1e11aee2273620e75efe4aa0390fcde9ba5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d40569ae9dbd693c0ab3d6ba69704d31e451011b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d40569ae9dbd693c0ab3d6ba69704d31e451011b.hip deleted file mode 100644 index f38468cb1d8d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d40569ae9dbd693c0ab3d6ba69704d31e451011b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return 
ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d41b6a64dd181f2efa65aaed03a3d229b3566c1d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d41b6a64dd181f2efa65aaed03a3d229b3566c1d.hip deleted file mode 100644 index fcccaaa3538f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d41b6a64dd181f2efa65aaed03a3d229b3566c1d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, 
- false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d41cd6b60a97e7071518cbd1a63abb8b910df024.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d41cd6b60a97e7071518cbd1a63abb8b910df024.hip deleted file mode 100644 index 32007f72daa1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d41cd6b60a97e7071518cbd1a63abb8b910df024.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d43715cce8935439f90172d141050d78c7e76fb7.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d43715cce8935439f90172d141050d78c7e76fb7.hip deleted file mode 100644 index ab3595779079..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d43715cce8935439f90172d141050d78c7e76fb7.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4605b2ad3e3753c5f255678abc1690b949c5abc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4605b2ad3e3753c5f255678abc1690b949c5abc.hip deleted file mode 100644 index bf74552890ad..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4605b2ad3e3753c5f255678abc1690b949c5abc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4645b713821371161a9925dec8a3d6c157ba1aa.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4645b713821371161a9925dec8a3d6c157ba1aa.hip deleted file mode 100644 index 3bdb63e15dcb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4645b713821371161a9925dec8a3d6c157ba1aa.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4aff499ad527be5fe33b8e92547df57af26d40d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4aff499ad527be5fe33b8e92547df57af26d40d.hip deleted file mode 100644 index 235361844edb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4aff499ad527be5fe33b8e92547df57af26d40d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4b99af9a573df50a27fccbec3fa8e350f1854eb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4b99af9a573df50a27fccbec3fa8e350f1854eb.hip deleted file mode 100644 index dbb25419bf47..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4b99af9a573df50a27fccbec3fa8e350f1854eb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, 
ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4c9f975891087e6eed6393629b41155deafc509.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4c9f975891087e6eed6393629b41155deafc509.hip deleted file mode 100644 index 24b01131cdd2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d4c9f975891087e6eed6393629b41155deafc509.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - 
ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d50ac8e8a03f8e7ec2c6e993dd39f09f465dab57.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d50ac8e8a03f8e7ec2c6e993dd39f09f465dab57.hip deleted file mode 100644 index 99ca50c1d7ff..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d50ac8e8a03f8e7ec2c6e993dd39f09f465dab57.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d54ac01458df3f240e0656d82330f9de23ba9651.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d54ac01458df3f240e0656d82330f9de23ba9651.hip deleted file mode 100644 index a15c0881098d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d54ac01458df3f240e0656d82330f9de23ba9651.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d54b3731883a5f8393d60d27487f8d017aedd3f9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d54b3731883a5f8393d60d27487f8d017aedd3f9.hip deleted file mode 100644 index beae00879d5b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d54b3731883a5f8393d60d27487f8d017aedd3f9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - 
false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d5e82799f4452e148c3e02acd6526cf30757eb52.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d5e82799f4452e148c3e02acd6526cf30757eb52.hip deleted file mode 100644 index 6c2885c16cdd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d5e82799f4452e148c3e02acd6526cf30757eb52.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
-
-// auto generated by generate.py
-#include
-
-using fmha_dtype_0 = ck_tile::bf16_t;
-
-using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>;
-using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>;
-
-using fmha_shape_0 = ck_tile::TileFmhaShape,
-    fmha_warp_tile_0,
-    ck_tile::sequence<4, 1, 1>,
-    fmha_warp_tile_0,
-    true>;
-
-using fmha_trait_0 = ck_tile::TileFmhaTraits;
-using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask;
-
-using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem<
-    typename FmhaFwdTypeConfig::QDataType,
-    typename FmhaFwdTypeConfig::KDataType,
-    typename FmhaFwdTypeConfig::VDataType,
-    typename FmhaFwdTypeConfig::SaccDataType,
-    typename FmhaFwdTypeConfig::SMPLComputeDataType,
-    typename FmhaFwdTypeConfig::BiasDataType,
-    typename FmhaFwdTypeConfig::RandValOutputDataType,
-    typename FmhaFwdTypeConfig::LSEDataType,
-    typename FmhaFwdTypeConfig::PDataType,
-    typename FmhaFwdTypeConfig::OaccDataType,
-    typename FmhaFwdTypeConfig::ODataType,
-    fmha_shape_0,
-    false,
-    fmha_mask_0,
-    fmha_trait_0>;
-
-using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync<
-    fmha_pipeline_problem_0>;
-
-using fmha_epilogue_0 =
-    ck_tile::Default2DEpilogue::OaccDataType,
-    typename FmhaFwdTypeConfig::ODataType,
-    true, true>>;
-
-using fmha_kernel_0 =
-    ck_tile::FmhaFwdKernel,
-    fmha_pipeline_0,
-    fmha_epilogue_0>;
-
-using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true,
-    ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>;
-
-#include
-
-template<>
-float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a)
-{
-    using k_ = fmha_kernel_0;
-    if(s.log_level_ > 0)
-        std::cout << ", " << k_::GetName() << std::flush;
-    auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a);
-    constexpr dim3 blocks = k_::BlockSize();
-    constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
-    return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs));
-}
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d5edfe3e3dc3008b928c8e6dbd50784b905f189e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d5edfe3e3dc3008b928c8e6dbd50784b905f189e.hip
deleted file mode 100644
index c9b9deb0b994..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d5edfe3e3dc3008b928c8e6dbd50784b905f189e.hip
+++ /dev/null
@@ -1,138 +0,0 @@
-// ==========================================
-// THIS CODE IS AUTOGENERATED. DO NOT MODIFY.
-// @generated
-// ==========================================
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d600779c17b7b21c18e1308e6d765fe02a7945d3.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d600779c17b7b21c18e1308e6d765fe02a7945d3.hip deleted file mode 100644 index dd4cce764ed3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d600779c17b7b21c18e1308e6d765fe02a7945d3.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d6149eea92f2c40c11de3b778102fcf9b6a006b8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d6149eea92f2c40c11de3b778102fcf9b6a006b8.hip deleted file mode 100644 index bb7dd9158890..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d6149eea92f2c40c11de3b778102fcf9b6a006b8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d623b36cc3f56d1001b2d3abadd8a5628fefd014.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d623b36cc3f56d1001b2d3abadd8a5628fefd014.hip deleted file mode 100644 index ba05dc9911f1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d623b36cc3f56d1001b2d3abadd8a5628fefd014.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d63c8c746055851217a514321cd735eaf6937263.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d63c8c746055851217a514321cd735eaf6937263.hip deleted file mode 100644 index 8d06bb2fa187..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d63c8c746055851217a514321cd735eaf6937263.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d64b8b52f4a98801e185e2f132b2f80c29dd0c37.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d64b8b52f4a98801e185e2f132b2f80c29dd0c37.hip deleted file mode 100644 index 
9d3145849aa5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d64b8b52f4a98801e185e2f132b2f80c29dd0c37.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d66b79c4ebdcfd239cecec58203606bc123bd6bb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d66b79c4ebdcfd239cecec58203606bc123bd6bb.hip deleted file mode 100644 index 820d61d7280d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d66b79c4ebdcfd239cecec58203606bc123bd6bb.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
-
-// auto generated by generate.py
-#include
-
-using fmha_dtype_0 = ck_tile::fp16_t;
-
-using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>;
-using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>;
-
-using fmha_shape_0 = ck_tile::TileFmhaShape,
-    fmha_warp_tile_0,
-    ck_tile::sequence<4, 1, 1>,
-    fmha_warp_tile_0,
-    true>;
-
-using fmha_trait_0 = ck_tile::TileFmhaTraits;
-using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask;
-
-using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem<
-    typename FmhaFwdTypeConfig::QDataType,
-    typename FmhaFwdTypeConfig::KDataType,
-    typename FmhaFwdTypeConfig::VDataType,
-    typename FmhaFwdTypeConfig::SaccDataType,
-    typename FmhaFwdTypeConfig::SMPLComputeDataType,
-    typename FmhaFwdTypeConfig::BiasDataType,
-    typename FmhaFwdTypeConfig::RandValOutputDataType,
-    typename FmhaFwdTypeConfig::LSEDataType,
-    typename FmhaFwdTypeConfig::PDataType,
-    typename FmhaFwdTypeConfig::OaccDataType,
-    typename FmhaFwdTypeConfig::ODataType,
-    fmha_shape_0,
-    true,
-    fmha_mask_0,
-    fmha_trait_0>;
-
-using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync<
-    fmha_pipeline_problem_0>;
-
-using fmha_epilogue_0 =
-    ck_tile::Default2DEpilogue::OaccDataType,
-    typename FmhaFwdTypeConfig::ODataType,
-    true, true>>;
-
-using fmha_kernel_0 =
-    ck_tile::FmhaFwdKernel,
-    fmha_pipeline_0,
-    fmha_epilogue_0>;
-
-using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, true,128, 64, 32, 64, 32, 64, true,
-    ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>;
-
-#include
-
-template<>
-float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a)
-{
-    using k_ = fmha_kernel_0;
-    if(s.log_level_ > 0)
-        std::cout << ", " << k_::GetName() << std::flush;
-    auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a);
-    constexpr dim3 blocks = k_::BlockSize();
-    constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
-    return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs));
-}
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d66c30148a6fa816937f2f095802264d3dfa0273.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d66c30148a6fa816937f2f095802264d3dfa0273.hip
deleted file mode 100644
index 7ab541dd60cf..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d66c30148a6fa816937f2f095802264d3dfa0273.hip
+++ /dev/null
@@ -1,80 +0,0 @@
-// ==========================================
-// THIS CODE IS AUTOGENERATED. DO NOT MODIFY.
-// @generated
-// ==========================================
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-// auto generated by generate.py
-#include
-
-using fmha_dtype_0 = ck_tile::fp16_t;
-
-using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>;
-using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>;
-
-using fmha_shape_0 = ck_tile::TileFmhaShape,
-    fmha_warp_tile_0,
-    ck_tile::sequence<4, 1, 1>,
-    fmha_warp_tile_0,
-    true>;
-
-using fmha_trait_0 = ck_tile::TileFmhaTraits;
-using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask;
-
-using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem<
-    typename FmhaFwdTypeConfig::QDataType,
-    typename FmhaFwdTypeConfig::KDataType,
-    typename FmhaFwdTypeConfig::VDataType,
-    typename FmhaFwdTypeConfig::SaccDataType,
-    typename FmhaFwdTypeConfig::SMPLComputeDataType,
-    typename FmhaFwdTypeConfig::BiasDataType,
-    typename FmhaFwdTypeConfig::RandValOutputDataType,
-    typename FmhaFwdTypeConfig::LSEDataType,
-    typename FmhaFwdTypeConfig::PDataType,
-    typename FmhaFwdTypeConfig::OaccDataType,
-    typename FmhaFwdTypeConfig::ODataType,
-    fmha_shape_0,
-    true,
-    fmha_mask_0,
-    fmha_trait_0>;
-
-using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS<
-    fmha_pipeline_problem_0>;
-
-using fmha_epilogue_0 =
-    ck_tile::Default2DEpilogue::OaccDataType,
-    typename FmhaFwdTypeConfig::ODataType,
-    true, true>>;
-
-using fmha_kernel_0 =
-    ck_tile::FmhaFwdKernel,
-    fmha_pipeline_0,
-    fmha_epilogue_0>;
-
-using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, true,128, 128, 32, 256, 32, 256, true,
-    ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>;
-
-#include
-
-template<>
-float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a)
-{
-    using k_ = fmha_kernel_0;
-    if(s.log_level_ > 0)
-        std::cout << ", " << k_::GetName() << std::flush;
-    auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a);
-    constexpr dim3 blocks = k_::BlockSize();
-    constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
-    return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs));
-}
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d703eea8075cacec4d41fee7dc4734f593ee79e8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d703eea8075cacec4d41fee7dc4734f593ee79e8.hip
deleted file mode 100644
index c6c6cf716099..000000000000
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d703eea8075cacec4d41fee7dc4734f593ee79e8.hip
+++ /dev/null
@@ -1,80 +0,0 @@
-// ==========================================
-// THIS CODE IS AUTOGENERATED. DO NOT MODIFY.
-// @generated
-// ==========================================
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d712f23ef88ae5d7b161d36f42d22a5ba53b6354.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d712f23ef88ae5d7b161d36f42d22a5ba53b6354.hip deleted file mode 100644 index 08547b31058d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d712f23ef88ae5d7b161d36f42d22a5ba53b6354.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d713fe25dc90b3511fc259cebf463376dcb55d84.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d713fe25dc90b3511fc259cebf463376dcb55d84.hip deleted file mode 100644 index bb768bb88758..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d713fe25dc90b3511fc259cebf463376dcb55d84.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7145383e39dec0e346b5094401acf85ef3c2075.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7145383e39dec0e346b5094401acf85ef3c2075.hip deleted file mode 100644 index c323c090710b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7145383e39dec0e346b5094401acf85ef3c2075.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d723b191785c97d284675f700a7baeb52a2eb791.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d723b191785c97d284675f700a7baeb52a2eb791.hip deleted file mode 100644 index 219a5fc83135..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d723b191785c97d284675f700a7baeb52a2eb791.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7290cc4c3036c9205e689cbcc60e7d16b97a7d6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7290cc4c3036c9205e689cbcc60e7d16b97a7d6.hip deleted file mode 100644 index 07ac1999a0c2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7290cc4c3036c9205e689cbcc60e7d16b97a7d6.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", 
" << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d733f4c03e338ea7c6d8f759c1132499bdcea059.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d733f4c03e338ea7c6d8f759c1132499bdcea059.hip deleted file mode 100644 index 97536b5743c0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d733f4c03e338ea7c6d8f759c1132499bdcea059.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - 
if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d773df9ccfc1ace90fe3afb5c00976deabedf6f8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d773df9ccfc1ace90fe3afb5c00976deabedf6f8.hip deleted file mode 100644 index 14bff89c4bde..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d773df9ccfc1ace90fe3afb5c00976deabedf6f8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7adde8780b39f1364c572a19c3bfb19417678e3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7adde8780b39f1364c572a19c3bfb19417678e3.hip deleted file mode 100644 index 8e20942df458..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7adde8780b39f1364c572a19c3bfb19417678e3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7bda8157fb27d544e049fd7d2ec735725f1bf44.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7bda8157fb27d544e049fd7d2ec735725f1bf44.hip deleted file mode 100644 index 6d6266328941..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7bda8157fb27d544e049fd7d2ec735725f1bf44.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7fae2c18645d36a181a0bdd2d8ca7a4ac0f6d1d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7fae2c18645d36a181a0bdd2d8ca7a4ac0f6d1d.hip deleted file mode 100644 index 1b9cd96ceb9f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d7fae2c18645d36a181a0bdd2d8ca7a4ac0f6d1d.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d82773721479613ad72e334510a248f1436b38d6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d82773721479613ad72e334510a248f1436b38d6.hip deleted file mode 100644 index 7ec972e20080..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d82773721479613ad72e334510a248f1436b38d6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d867098db97b3f26e71a151c63b74260bfab21f8.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d867098db97b3f26e71a151c63b74260bfab21f8.hip deleted file mode 100644 index fdc9ff3d9fd6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d867098db97b3f26e71a151c63b74260bfab21f8.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d86e4dcbe9c4cac8f7c8c5d97ce384ae0cbdbfbc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d86e4dcbe9c4cac8f7c8c5d97ce384ae0cbdbfbc.hip deleted file mode 100644 index 8e100e7b34ab..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d86e4dcbe9c4cac8f7c8c5d97ce384ae0cbdbfbc.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d8901a63986cc28ef24cab012b32114851a8c1ec.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d8901a63986cc28ef24cab012b32114851a8c1ec.hip deleted file mode 100644 index d3deccfe241a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d8901a63986cc28ef24cab012b32114851a8c1ec.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d9061c204d8a85c974676f4438994a0be9d69a60.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d9061c204d8a85c974676f4438994a0be9d69a60.hip deleted file mode 100644 index 58f4dc0004ba..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d9061c204d8a85c974676f4438994a0be9d69a60.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d924ee32b178b6bffa7a71603d6e2818f66177a5.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d924ee32b178b6bffa7a71603d6e2818f66177a5.hip deleted file mode 100644 index bf00a987c4d3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d924ee32b178b6bffa7a71603d6e2818f66177a5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d937609afa8e21a761dad6b01ff3f26346e450fc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d937609afa8e21a761dad6b01ff3f26346e450fc.hip deleted file mode 100644 index a4d16e3cdd9c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d937609afa8e21a761dad6b01ff3f26346e450fc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d95835bc6f000d3a3379bbc38d90e83dcaf867ee.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d95835bc6f000d3a3379bbc38d90e83dcaf867ee.hip deleted file mode 100644 index 5eb2be47469a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d95835bc6f000d3a3379bbc38d90e83dcaf867ee.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d992eab7de49033f5480c5e86a69e675db0d2a19.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d992eab7de49033f5480c5e86a69e675db0d2a19.hip deleted file mode 100644 index 0a8788862dd7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d992eab7de49033f5480c5e86a69e675db0d2a19.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d9c23b7f8fcc4e4f4c81f5f00cfd345b98df2e0f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d9c23b7f8fcc4e4f4c81f5f00cfd345b98df2e0f.hip deleted file mode 100644 index 26cd69469ea2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d9c23b7f8fcc4e4f4c81f5f00cfd345b98df2e0f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d9c3e27b522320dcca5ee84fa534b03aae2bfea9.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d9c3e27b522320dcca5ee84fa534b03aae2bfea9.hip deleted file mode 100644 index 27e2cec912a6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_d9c3e27b522320dcca5ee84fa534b03aae2bfea9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da07d8b5666423da30a95e3b2cabd3839d200981.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da07d8b5666423da30a95e3b2cabd3839d200981.hip deleted file mode 100644 index 882eb0659961..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da07d8b5666423da30a95e3b2cabd3839d200981.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da29a515d14dac02066bcd4701285b9916b43cf5.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da29a515d14dac02066bcd4701285b9916b43cf5.hip deleted file mode 100644 index 868b3bcfef64..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da29a515d14dac02066bcd4701285b9916b43cf5.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da6afccdee4107507a64323e17bf12c46da2b92a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da6afccdee4107507a64323e17bf12c46da2b92a.hip deleted file mode 100644 index f2a99dce8204..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da6afccdee4107507a64323e17bf12c46da2b92a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da74887afedbd67928fe4d596709f9ff92530611.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da74887afedbd67928fe4d596709f9ff92530611.hip deleted file mode 100644 index bd3dd21914c5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da74887afedbd67928fe4d596709f9ff92530611.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da822ea727fb3543e445e4000f7e6ebb946d6a3b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da822ea727fb3543e445e4000f7e6ebb946d6a3b.hip deleted file mode 100644 index d73043aa58d9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da822ea727fb3543e445e4000f7e6ebb946d6a3b.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da9f6e1d59132fe96709490af25bd794f267851c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da9f6e1d59132fe96709490af25bd794f267851c.hip deleted file mode 100644 index e75388caf2d4..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_da9f6e1d59132fe96709490af25bd794f267851c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr 
ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_db0d0cf55d90b3f3c9eecada1db93c420f34b1ae.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_db0d0cf55d90b3f3c9eecada1db93c420f34b1ae.hip deleted file mode 100644 index 69c56122a64b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_db0d0cf55d90b3f3c9eecada1db93c420f34b1ae.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", 
" << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_db5016bff9e5dc37184d2b9417eb351c7ea1c322.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_db5016bff9e5dc37184d2b9417eb351c7ea1c322.hip deleted file mode 100644 index 8f34d4d0813d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_db5016bff9e5dc37184d2b9417eb351c7ea1c322.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - 
return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_db85839ee8d464c5a81b8dad9839f5e0f4b467a8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_db85839ee8d464c5a81b8dad9839f5e0f4b467a8.hip deleted file mode 100644 index fbb400e9cfc7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_db85839ee8d464c5a81b8dad9839f5e0f4b467a8.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_db8f0bd93b352d28c5b6d78f4332026993f0bea4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_db8f0bd93b352d28c5b6d78f4332026993f0bea4.hip deleted file mode 100644 index 96ecc14b5396..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_db8f0bd93b352d28c5b6d78f4332026993f0bea4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbae1670fac6812b2d2cbad973e4b475509ea504.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbae1670fac6812b2d2cbad973e4b475509ea504.hip deleted file mode 100644 index 8315731de939..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbae1670fac6812b2d2cbad973e4b475509ea504.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = 
k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbb06b43d5d65429e23cc717448cf1fffb0cfd74.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbb06b43d5d65429e23cc717448cf1fffb0cfd74.hip deleted file mode 100644 index e3328b73adb4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbb06b43d5d65429e23cc717448cf1fffb0cfd74.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = 
fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbc4135fce01e8731fec7a78d0cc0fdeeae28b90.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbc4135fce01e8731fec7a78d0cc0fdeeae28b90.hip deleted file mode 100644 index a95d75474613..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbc4135fce01e8731fec7a78d0cc0fdeeae28b90.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::fp16_t, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbcea8f7b5930abf76eecefce92d0db785d2df5d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbcea8f7b5930abf76eecefce92d0db785d2df5d.hip deleted file mode 100644 index 9bd274029d5d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbcea8f7b5930abf76eecefce92d0db785d2df5d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbde2ef18e2174ebe13a6e7c8c2a6b05a6612047.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbde2ef18e2174ebe13a6e7c8c2a6b05a6612047.hip deleted file mode 100644 index 06855f1bd70c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dbde2ef18e2174ebe13a6e7c8c2a6b05a6612047.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc039d422a57c159ea4dbcc867d766ff1b356a07.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc039d422a57c159ea4dbcc867d766ff1b356a07.hip deleted file mode 100644 index 9c93a8910366..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc039d422a57c159ea4dbcc867d766ff1b356a07.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc08afbff5def8bcb4e823657ce01f57c9dc77c9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc08afbff5def8bcb4e823657ce01f57c9dc77c9.hip deleted file mode 100644 index 949824c472e5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc08afbff5def8bcb4e823657ce01f57c9dc77c9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc184767d723f4995791848cdc68bd948408204f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc184767d723f4995791848cdc68bd948408204f.hip deleted file mode 100644 index 0c1b6d1f19eb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc184767d723f4995791848cdc68bd948408204f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc1a7f9b1afeba6690fdc0d0d1755ea89c805573.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc1a7f9b1afeba6690fdc0d0d1755ea89c805573.hip deleted file mode 100644 index c8a9ebbeb086..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc1a7f9b1afeba6690fdc0d0d1755ea89c805573.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ 
- using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc34b6ef496d4e0d8fbbe10731d4a7b1c136c036.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc34b6ef496d4e0d8fbbe10731d4a7b1c136c036.hip deleted file mode 100644 index cfc31f9c7da1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc34b6ef496d4e0d8fbbe10731d4a7b1c136c036.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc3d625c5ad3e871f5a727ac946df642d988b9ab.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc3d625c5ad3e871f5a727ac946df642d988b9ab.hip deleted file mode 100644 index 4d1fd757a5d7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc3d625c5ad3e871f5a727ac946df642d988b9ab.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc4d27535b9570b8f4b790470a83c1d0a9a2b6ce.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc4d27535b9570b8f4b790470a83c1d0a9a2b6ce.hip deleted file mode 100644 index af903c96af62..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc4d27535b9570b8f4b790470a83c1d0a9a2b6ce.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc5ba6d73f331c76e696953606c5b347b6a46f3f.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc5ba6d73f331c76e696953606c5b347b6a46f3f.hip deleted file mode 100644 index 80eb63e6db38..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc5ba6d73f331c76e696953606c5b347b6a46f3f.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc62a8db637d32e7dfdb2521cbdae6e1fbbd5fd1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc62a8db637d32e7dfdb2521cbdae6e1fbbd5fd1.hip deleted file mode 100644 index 043b84f9f300..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc62a8db637d32e7dfdb2521cbdae6e1fbbd5fd1.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc818f3ce244743cb1dbff9aca399df90742a6d0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc818f3ce244743cb1dbff9aca399df90742a6d0.hip deleted file mode 100644 index 2b28b28a6e94..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc818f3ce244743cb1dbff9aca399df90742a6d0.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::bf16_t, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc91797c1474a368e9cb056b50b4629d7736c3cb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc91797c1474a368e9cb056b50b4629d7736c3cb.hip deleted file mode 100644 index 2a222d6728c2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc91797c1474a368e9cb056b50b4629d7736c3cb.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc9e54273c0ea2358fb573a7d918aa7b09fe07f9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc9e54273c0ea2358fb573a7d918aa7b09fe07f9.hip deleted file mode 100644 index 265006b2cb94..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dc9e54273c0ea2358fb573a7d918aa7b09fe07f9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dcf815ef540060cc7ed43e1c57a28e1d080c5621.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dcf815ef540060cc7ed43e1c57a28e1d080c5621.hip deleted file mode 100644 index 4a27c80049d5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dcf815ef540060cc7ed43e1c57a28e1d080c5621.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd10bbf37503bbc92af82bc3487989b41b20ca85.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd10bbf37503bbc92af82bc3487989b41b20ca85.hip deleted file mode 100644 index 82b2ecd178cd..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd10bbf37503bbc92af82bc3487989b41b20ca85.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd11806cd2d3ef1127f676b2d98bf8fff2a1e5ab.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd11806cd2d3ef1127f676b2d98bf8fff2a1e5ab.hip deleted file mode 100644 index 718734612001..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd11806cd2d3ef1127f676b2d98bf8fff2a1e5ab.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd35634440edb25cb095800b882c70aaceca1dbb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd35634440edb25cb095800b882c70aaceca1dbb.hip deleted file mode 100644 index e4810cf30127..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd35634440edb25cb095800b882c70aaceca1dbb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd67d442001d2b167e70e8730abde4d4461b8569.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd67d442001d2b167e70e8730abde4d4461b8569.hip deleted file mode 100644 index 94216509787d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd67d442001d2b167e70e8730abde4d4461b8569.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd9494d9ac35eba6794a4f9120d2db9932596ef8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd9494d9ac35eba6794a4f9120d2db9932596ef8.hip deleted file mode 100644 index 2fb0d4ba5f50..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dd9494d9ac35eba6794a4f9120d2db9932596ef8.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, true,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dda8d021381083bc48b7fb1840729254dd8e5137.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dda8d021381083bc48b7fb1840729254dd8e5137.hip deleted file mode 100644 index 2f36a58ffe77..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dda8d021381083bc48b7fb1840729254dd8e5137.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr 
ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ddcb1cfea1b0dbe50a02252cba99428fd977527e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ddcb1cfea1b0dbe50a02252cba99428fd977527e.hip deleted file mode 100644 index 1ae834a69425..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ddcb1cfea1b0dbe50a02252cba99428fd977527e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - 
std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dde93ffe7fca311e136e42fbcd12b05c9fc7174c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dde93ffe7fca311e136e42fbcd12b05c9fc7174c.hip deleted file mode 100644 index b68ef694340b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dde93ffe7fca311e136e42fbcd12b05c9fc7174c.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::fp16_t, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ddf5339054f47d9ed6cc7f9e66ab21ce3bccf3db.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ddf5339054f47d9ed6cc7f9e66ab21ce3bccf3db.hip deleted file mode 100644 index 0e0c167c3b60..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ddf5339054f47d9ed6cc7f9e66ab21ce3bccf3db.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de1ff66d2aeb47d2fdccaa4bb6b9d066b380c99e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de1ff66d2aeb47d2fdccaa4bb6b9d066b380c99e.hip deleted file mode 100644 index 0926fc5eef32..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de1ff66d2aeb47d2fdccaa4bb6b9d066b380c99e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de26a187c4db06115072a5132e1166b5b03368b0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de26a187c4db06115072a5132e1166b5b03368b0.hip deleted file mode 100644 index 2bb11c53165b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de26a187c4db06115072a5132e1166b5b03368b0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de36bc309877917a18fd21acb30563c7e2f233c1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de36bc309877917a18fd21acb30563c7e2f233c1.hip deleted file mode 100644 index 0d63a90fde92..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de36bc309877917a18fd21acb30563c7e2f233c1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de5359f0fba3da9dfed06ddbea8fe2a33a9cf40c.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de5359f0fba3da9dfed06ddbea8fe2a33a9cf40c.hip deleted file mode 100644 index e27f787455d1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de5359f0fba3da9dfed06ddbea8fe2a33a9cf40c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de6683d175affaa5ff261ab8503f64172d8eba8b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de6683d175affaa5ff261ab8503f64172d8eba8b.hip deleted file mode 100644 index 1c3096ece238..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de6683d175affaa5ff261ab8503f64172d8eba8b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de7eb562a7eff31d589e12945d80233aac202ae2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de7eb562a7eff31d589e12945d80233aac202ae2.hip deleted file mode 100644 index 34c1fc87b835..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de7eb562a7eff31d589e12945d80233aac202ae2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de85901d66dc04b1143bb6404445baf65693b781.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de85901d66dc04b1143bb6404445baf65693b781.hip deleted file mode 100644 index 9cd6fbd8d413..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_de85901d66dc04b1143bb6404445baf65693b781.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_deb9ec2cccab94920e40f62a1f0f094acd919d07.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_deb9ec2cccab94920e40f62a1f0f094acd919d07.hip deleted file mode 100644 index f7f8414dc77a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_deb9ec2cccab94920e40f62a1f0f094acd919d07.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df0b2bcba57e77d975ec5304fc50cbd09cddf4bb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df0b2bcba57e77d975ec5304fc50cbd09cddf4bb.hip deleted file mode 100644 index 8c0ab4ea85f1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df0b2bcba57e77d975ec5304fc50cbd09cddf4bb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df4bb75ca79f805a81fbad750ad22f6d22b0d8ff.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df4bb75ca79f805a81fbad750ad22f6d22b0d8ff.hip deleted file mode 100644 index 2558cbe0b9e2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df4bb75ca79f805a81fbad750ad22f6d22b0d8ff.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df4c9eb48da49a61957537270d94e56cb4e426be.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df4c9eb48da49a61957537270d94e56cb4e426be.hip deleted file mode 100644 index 6c8837fbec7d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df4c9eb48da49a61957537270d94e56cb4e426be.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df5b1c6758d4b8540158299dd0362297083084c2.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df5b1c6758d4b8540158299dd0362297083084c2.hip deleted file mode 100644 index 7231dd59a6ad..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df5b1c6758d4b8540158299dd0362297083084c2.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df645b3888dc8d1df50c47c0d75822eebd3eb019.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df645b3888dc8d1df50c47c0d75822eebd3eb019.hip deleted file mode 100644 index 8d96d0d04aac..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df645b3888dc8d1df50c47c0d75822eebd3eb019.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df66feebc9a0dcc508ce002c255154622875e524.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df66feebc9a0dcc508ce002c255154622875e524.hip deleted file mode 100644 index e8fb2a96b9b5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_df66feebc9a0dcc508ce002c255154622875e524.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dfcd68acfca68d1acac94f493e25be0ef20f209f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dfcd68acfca68d1acac94f493e25be0ef20f209f.hip deleted file mode 100644 index e3c865be35f8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_dfcd68acfca68d1acac94f493e25be0ef20f209f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e02a198f23c409b715761b702d7b0e6e5992701f.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e02a198f23c409b715761b702d7b0e6e5992701f.hip deleted file mode 100644 index 11b33022ae57..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e02a198f23c409b715761b702d7b0e6e5992701f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e035773419a9b3631698a3d375d829af55f7731e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e035773419a9b3631698a3d375d829af55f7731e.hip deleted file mode 100644 index 26061b55ae0a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e035773419a9b3631698a3d375d829af55f7731e.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e088f0f7363804cf5403adef70828ab32d09a02a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e088f0f7363804cf5403adef70828ab32d09a02a.hip deleted file mode 100644 index e78f750aeb77..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e088f0f7363804cf5403adef70828ab32d09a02a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - 
constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e0966fa1ff013e477b1706928de6cb7f8587c154.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e0966fa1ff013e477b1706928de6cb7f8587c154.hip deleted file mode 100644 index c09d32372ad4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e0966fa1ff013e477b1706928de6cb7f8587c154.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - 
if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e09d9baa269dfbb30b714389d1733be51cc419b7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e09d9baa269dfbb30b714389d1733be51cc419b7.hip deleted file mode 100644 index d2e2e921b065..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e09d9baa269dfbb30b714389d1733be51cc419b7.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 64, - 256, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<256, - ck_tile::fp16_t, - false, - false, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e0e48d7edfe9513f24ad9fae68cac3aa940b17dd.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e0e48d7edfe9513f24ad9fae68cac3aa940b17dd.hip deleted file mode 100644 index 8b37aef49222..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e0e48d7edfe9513f24ad9fae68cac3aa940b17dd.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e10f47a44400de385ddbeb99475b717c5646fb41.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e10f47a44400de385ddbeb99475b717c5646fb41.hip deleted file mode 100644 index 0ce7a98dc0c8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e10f47a44400de385ddbeb99475b717c5646fb41.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e11a3b7d4fdfed64e64f7a95dbc64eff541092d6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e11a3b7d4fdfed64e64f7a95dbc64eff541092d6.hip deleted file mode 100644 index 5ec322eafb11..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e11a3b7d4fdfed64e64f7a95dbc64eff541092d6.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::bf16_t, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e13b86fe4e153e0bfa8d1e75f3641fe32b0c5149.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e13b86fe4e153e0bfa8d1e75f3641fe32b0c5149.hip deleted file mode 100644 index 19a8ab7b04d8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e13b86fe4e153e0bfa8d1e75f3641fe32b0c5149.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e16075c3a5fcfe63ba12e854bb1fed6873f014ab.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e16075c3a5fcfe63ba12e854bb1fed6873f014ab.hip deleted file mode 100644 index 1330304ff714..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e16075c3a5fcfe63ba12e854bb1fed6873f014ab.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 32, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, false, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e16edb824cecf459a8ec51b8dc74b1e06369aceb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e16edb824cecf459a8ec51b8dc74b1e06369aceb.hip deleted file mode 100644 index 98120391c708..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e16edb824cecf459a8ec51b8dc74b1e06369aceb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e1c1a31a1d8556cbe0b6ea76faacc78855108539.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e1c1a31a1d8556cbe0b6ea76faacc78855108539.hip deleted file mode 100644 index 3f2d3b26b68e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e1c1a31a1d8556cbe0b6ea76faacc78855108539.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ 
- using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e1cc934ba7baab1a2eb062df1e4ee5066e9ffbc3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e1cc934ba7baab1a2eb062df1e4ee5066e9ffbc3.hip deleted file mode 100644 index d7b52329688e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e1cc934ba7baab1a2eb062df1e4ee5066e9ffbc3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include 
- -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e1d85ad2c9d197f501267fe0804e6985802fbd18.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e1d85ad2c9d197f501267fe0804e6985802fbd18.hip deleted file mode 100644 index 716331a82880..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e1d85ad2c9d197f501267fe0804e6985802fbd18.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - 
typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2762543d3380185e304f84749a70db1b8d3dd8c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2762543d3380185e304f84749a70db1b8d3dd8c.hip deleted file mode 100644 index afd102f264fc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2762543d3380185e304f84749a70db1b8d3dd8c.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e28fd64c2f2b27577109a984e6ab82f5f0fcb296.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e28fd64c2f2b27577109a984e6ab82f5f0fcb296.hip deleted file mode 100644 index cfeae1a8441f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e28fd64c2f2b27577109a984e6ab82f5f0fcb296.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2b629c37cf94134693ce455b8c88b72a39df7fe.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2b629c37cf94134693ce455b8c88b72a39df7fe.hip deleted file mode 100644 index 711f8949ebb2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2b629c37cf94134693ce455b8c88b72a39df7fe.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2bf6805a489739abb77c13173d57723e9304afa.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2bf6805a489739abb77c13173d57723e9304afa.hip deleted file mode 100644 index 909734e02eaa..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2bf6805a489739abb77c13173d57723e9304afa.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2c9f955f227430c6224ebc347649386be7f01eb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2c9f955f227430c6224ebc347649386be7f01eb.hip deleted file mode 100644 index fc0ed99b6c01..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2c9f955f227430c6224ebc347649386be7f01eb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2deafd2f36cee29109fb824e0135407453adcfe.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2deafd2f36cee29109fb824e0135407453adcfe.hip deleted file mode 100644 index 73401b787b4d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e2deafd2f36cee29109fb824e0135407453adcfe.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e3015c5d50481547aa5754d042d9d7040cf1c7ff.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e3015c5d50481547aa5754d042d9d7040cf1c7ff.hip deleted file mode 100644 index 6455762cb947..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e3015c5d50481547aa5754d042d9d7040cf1c7ff.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e307a1b0d5a8f94e0a0f4032f401d20b4b643523.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e307a1b0d5a8f94e0a0f4032f401d20b4b643523.hip deleted file mode 100644 index 3d43ac4cf01f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e307a1b0d5a8f94e0a0f4032f401d20b4b643523.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e334e691714f0b99773c2ac515ed82de0f387065.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e334e691714f0b99773c2ac515ed82de0f387065.hip deleted file mode 100644 index 3839ba35429d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e334e691714f0b99773c2ac515ed82de0f387065.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e34b7e452a4db74189334697e3a240ad68085f0e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e34b7e452a4db74189334697e3a240ad68085f0e.hip deleted file mode 100644 index b6268ae2e48e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e34b7e452a4db74189334697e3a240ad68085f0e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e389d0e4442cd8304081892ddc75043e68a6398c.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e389d0e4442cd8304081892ddc75043e68a6398c.hip deleted file mode 100644 index 9e34eaf3da34..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e389d0e4442cd8304081892ddc75043e68a6398c.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e465193d97d43237c22c04478ca5833011d8dc8b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e465193d97d43237c22c04478ca5833011d8dc8b.hip deleted file mode 100644 index e0219c5cd464..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e465193d97d43237c22c04478ca5833011d8dc8b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e477abef05ff37ec27705eda51896e2aa3a04966.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e477abef05ff37ec27705eda51896e2aa3a04966.hip deleted file mode 100644 index 2adad7223516..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e477abef05ff37ec27705eda51896e2aa3a04966.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e4d9a2396ceccdadab24602f30e9070901a76dc7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e4d9a2396ceccdadab24602f30e9070901a76dc7.hip deleted file mode 100644 index efd272520172..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e4d9a2396ceccdadab24602f30e9070901a76dc7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e502730dea6987e2c038446c448aa08bdcc23113.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e502730dea6987e2c038446c448aa08bdcc23113.hip deleted file mode 100644 index 57155cc00fcc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e502730dea6987e2c038446c448aa08bdcc23113.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e514c6b4bc75d95a150104a17972abae77cb47ed.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e514c6b4bc75d95a150104a17972abae77cb47ed.hip deleted file mode 100644 index 6a5b2c6d13c9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e514c6b4bc75d95a150104a17972abae77cb47ed.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e52e3053f30f780f346fa6b7a836ad2554cb85df.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e52e3053f30f780f346fa6b7a836ad2554cb85df.hip deleted file mode 100644 index d1402916ab42..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e52e3053f30f780f346fa6b7a836ad2554cb85df.hip +++ /dev/null @@ -1,80 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e56757fb17f5e94a6ba1fb14540a68c36d571159.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e56757fb17f5e94a6ba1fb14540a68c36d571159.hip deleted file mode 100644 index 2635cbc89366..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e56757fb17f5e94a6ba1fb14540a68c36d571159.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e578ec9e09d3b78dca6b5bf0be1538657f02f319.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e578ec9e09d3b78dca6b5bf0be1538657f02f319.hip deleted file mode 100644 index 35c85a1f9d39..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e578ec9e09d3b78dca6b5bf0be1538657f02f319.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5935fbda313d3518f142f43d46f56c600f69286.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5935fbda313d3518f142f43d46f56c600f69286.hip deleted file mode 100644 index 9c8ff3e0f0ea..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5935fbda313d3518f142f43d46f56c600f69286.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5b2bb9f8466de1ad5210e4c39ee7b8ecacdffa9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5b2bb9f8466de1ad5210e4c39ee7b8ecacdffa9.hip deleted file mode 100644 index c3f716b6782d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5b2bb9f8466de1ad5210e4c39ee7b8ecacdffa9.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5b65fc519ea7cfcd19f7eddbc3acad6842ff558.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5b65fc519ea7cfcd19f7eddbc3acad6842ff558.hip deleted file mode 100644 index 9287b1cfd179..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5b65fc519ea7cfcd19f7eddbc3acad6842ff558.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5c5079636a4a31a849ce8a5af89d50330a74628.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5c5079636a4a31a849ce8a5af89d50330a74628.hip deleted file mode 100644 index 04d0a50da47d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5c5079636a4a31a849ce8a5af89d50330a74628.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::bf16_t, - false, - false, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5ccd5f7ddc894b2717112cbfc766804e02b7bd1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5ccd5f7ddc894b2717112cbfc766804e02b7bd1.hip deleted file mode 100644 index d98d14f90e26..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e5ccd5f7ddc894b2717112cbfc766804e02b7bd1.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e618fb4e529104fc90069c8779ce5463460bd516.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e618fb4e529104fc90069c8779ce5463460bd516.hip deleted file mode 100644 index 6c66a8aba8af..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e618fb4e529104fc90069c8779ce5463460bd516.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e638053e01268a4c5883620fc6a9901951e2e01a.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e638053e01268a4c5883620fc6a9901951e2e01a.hip deleted file mode 100644 index d8b3b8550470..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e638053e01268a4c5883620fc6a9901951e2e01a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e639a1e84faa98477b05df71d363b9ff0f9b2760.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e639a1e84faa98477b05df71d363b9ff0f9b2760.hip deleted file mode 100644 index c85bc59841b0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e639a1e84faa98477b05df71d363b9ff0f9b2760.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e68a9e05debd456a9975953f7b0d510e7a0f6978.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e68a9e05debd456a9975953f7b0d510e7a0f6978.hip deleted file mode 100644 index 7e48c597d5e6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e68a9e05debd456a9975953f7b0d510e7a0f6978.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e6973d75297bd2c3432a7c88e8a9ee1c9ae693bf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e6973d75297bd2c3432a7c88e8a9ee1c9ae693bf.hip deleted file mode 100644 index 3a2d6375b87f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e6973d75297bd2c3432a7c88e8a9ee1c9ae693bf.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e6b53fb8d81148ff384d31a703bb4c2e7a5a33af.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e6b53fb8d81148ff384d31a703bb4c2e7a5a33af.hip deleted file mode 100644 index 18cf40ec5aa1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e6b53fb8d81148ff384d31a703bb4c2e7a5a33af.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::fp16_t, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e6e0ec1db1ea308e226f675e68e29b839e41b252.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e6e0ec1db1ea308e226f675e68e29b839e41b252.hip deleted file mode 100644 index 0d7760b463a6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e6e0ec1db1ea308e226f675e68e29b839e41b252.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e6e6b10e73733716e71ebf5a53703fb935fc5e02.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e6e6b10e73733716e71ebf5a53703fb935fc5e02.hip deleted file mode 100644 index 8acda06dd901..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e6e6b10e73733716e71ebf5a53703fb935fc5e02.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - true, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::fp16_t, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7153f9a9b0b7c54ddf2debbe297efcffbb4fcfa.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7153f9a9b0b7c54ddf2debbe297efcffbb4fcfa.hip deleted file mode 100644 index 507e4ca2ff1f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7153f9a9b0b7c54ddf2debbe297efcffbb4fcfa.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 128, - true, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<128, ck_tile::bf16_t, true, true, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e73a776ae4ba68c23acab1a5a6381684051738ab.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e73a776ae4ba68c23acab1a5a6381684051738ab.hip deleted file mode 100644 index d0c25a7283f6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e73a776ae4ba68c23acab1a5a6381684051738ab.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e75c757c67aa23cb88e1aced6fcf36b7b28391db.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e75c757c67aa23cb88e1aced6fcf36b7b28391db.hip deleted file mode 100644 index 1e8b6398d9e2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e75c757c67aa23cb88e1aced6fcf36b7b28391db.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - true, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::fp16_t, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e75d492ac3a6ab75648056bcf26250a4aa929cfd.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e75d492ac3a6ab75648056bcf26250a4aa929cfd.hip deleted file mode 100644 index b9b534ef39a4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e75d492ac3a6ab75648056bcf26250a4aa929cfd.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e76879f8ff4796f48ad87ff8003f4f6e6adca9a0.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e76879f8ff4796f48ad87ff8003f4f6e6adca9a0.hip deleted file mode 100644 index f5253a7b60c1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e76879f8ff4796f48ad87ff8003f4f6e6adca9a0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7ae1294b6dea5c8b93c2b814fa7460c4047105b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7ae1294b6dea5c8b93c2b814fa7460c4047105b.hip deleted file mode 100644 index 982c1f7ca71c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7ae1294b6dea5c8b93c2b814fa7460c4047105b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7b2eb64b66d46359fab44333c2c484f4c9dd5de.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7b2eb64b66d46359fab44333c2c484f4c9dd5de.hip deleted file mode 100644 index aa64d12f513b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7b2eb64b66d46359fab44333c2c484f4c9dd5de.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7c0a99e949baa5f3a7ee2d6e84427982f82f76d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7c0a99e949baa5f3a7ee2d6e84427982f82f76d.hip deleted file mode 100644 index b8bacaa883de..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7c0a99e949baa5f3a7ee2d6e84427982f82f76d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7d37e7ee96c392fa24c02a9143438a3a7d05741.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7d37e7ee96c392fa24c02a9143438a3a7d05741.hip deleted file mode 100644 index dc213af2ce75..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7d37e7ee96c392fa24c02a9143438a3a7d05741.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7de729aa50c10d8101ef504138c3769e3286753.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7de729aa50c10d8101ef504138c3769e3286753.hip deleted file mode 100644 index 35e14e0d3c31..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e7de729aa50c10d8101ef504138c3769e3286753.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e83c604d1b8260958becd1c7c209745ff9151715.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e83c604d1b8260958becd1c7c209745ff9151715.hip deleted file mode 100644 index d78d224e9747..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e83c604d1b8260958becd1c7c209745ff9151715.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e89bcea4393593313d18a4aa6dcb44cd75bc828d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e89bcea4393593313d18a4aa6dcb44cd75bc828d.hip deleted file mode 100644 index 5e069502bc59..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e89bcea4393593313d18a4aa6dcb44cd75bc828d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e8a9427f34bbf5ddb28a39161acc36806e68f2d0.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e8a9427f34bbf5ddb28a39161acc36806e68f2d0.hip deleted file mode 100644 index 5fd258cffbb2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e8a9427f34bbf5ddb28a39161acc36806e68f2d0.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::fp16_t, - false, - false, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e8d8fe5f4f8641998b8b805a20b2ca92d019ee59.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e8d8fe5f4f8641998b8b805a20b2ca92d019ee59.hip deleted file mode 100644 index 403288f4f73f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e8d8fe5f4f8641998b8b805a20b2ca92d019ee59.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e8d9b65558398c0c10127b560807578ef117d7ed.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e8d9b65558398c0c10127b560807578ef117d7ed.hip deleted file mode 100644 index 1827de25165b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e8d9b65558398c0c10127b560807578ef117d7ed.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const 
ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e907e8d1089557dfcc95a05160be5092e9119a53.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e907e8d1089557dfcc95a05160be5092e9119a53.hip deleted file mode 100644 index 88fe0d2cf3e4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e907e8d1089557dfcc95a05160be5092e9119a53.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - 
false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e95e3908479965856843317c8b0c42a6961dfd23.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e95e3908479965856843317c8b0c42a6961dfd23.hip deleted file mode 100644 index f53424f68a44..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e95e3908479965856843317c8b0c42a6961dfd23.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e986d5f8d5591f3e0f1cdfad19c38c420fd93023.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e986d5f8d5591f3e0f1cdfad19c38c420fd93023.hip deleted file mode 100644 index cc361a2b8812..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e986d5f8d5591f3e0f1cdfad19c38c420fd93023.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e9b04e6d5527ba0b8089ba8bdd264e2d5759338b.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e9b04e6d5527ba0b8089ba8bdd264e2d5759338b.hip deleted file mode 100644 index 3cc03123fc08..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e9b04e6d5527ba0b8089ba8bdd264e2d5759338b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e9b53fa68641f45baabf40b7cfb8b35a9a1b9c7f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e9b53fa68641f45baabf40b7cfb8b35a9a1b9c7f.hip deleted file mode 100644 index b7ff6e331187..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_e9b53fa68641f45baabf40b7cfb8b35a9a1b9c7f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ea077e68dbc1bed2dd20a5f4dd35e0cad6330ee4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ea077e68dbc1bed2dd20a5f4dd35e0cad6330ee4.hip deleted file mode 100644 index bf1d98875784..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ea077e68dbc1bed2dd20a5f4dd35e0cad6330ee4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ea591185b1c5f521023e250a26f742984255b241.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ea591185b1c5f521023e250a26f742984255b241.hip deleted file mode 100644 index 9bdfda3e22a0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ea591185b1c5f521023e250a26f742984255b241.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ea62567e9ea16771d8445464c38f5a2931cb355a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ea62567e9ea16771d8445464c38f5a2931cb355a.hip deleted file mode 100644 index ffe6de2cdf76..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ea62567e9ea16771d8445464c38f5a2931cb355a.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::bf16_t, - false, - true, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ea6a6d4cc262ea838dbb83ee747112f95fa297bc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ea6a6d4cc262ea838dbb83ee747112f95fa297bc.hip deleted file mode 100644 index 7a0bcef89fbf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ea6a6d4cc262ea838dbb83ee747112f95fa297bc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eab6cdc59bf216f7045f0cf5f221bb91ec415cd2.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eab6cdc59bf216f7045f0cf5f221bb91ec415cd2.hip deleted file mode 100644 index f77d15767190..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eab6cdc59bf216f7045f0cf5f221bb91ec415cd2.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eac353f963c52624cf79e82cc2b2c02eed94b677.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eac353f963c52624cf79e82cc2b2c02eed94b677.hip deleted file mode 100644 index ae04c653a625..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eac353f963c52624cf79e82cc2b2c02eed94b677.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eac5952f46f4f2bf06257b00661774eeed48a323.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eac5952f46f4f2bf06257b00661774eeed48a323.hip deleted file mode 100644 index d0b9029b79e1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eac5952f46f4f2bf06257b00661774eeed48a323.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 256, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, false, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eb278488b2cca114adca5e4614d86f92447f937a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eb278488b2cca114adca5e4614d86f92447f937a.hip deleted file mode 100644 index 53912552f8ce..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eb278488b2cca114adca5e4614d86f92447f937a.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ebb241b947a0adfc8e50c5d71765c14af24593ae.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ebb241b947a0adfc8e50c5d71765c14af24593ae.hip deleted file mode 100644 index ff1b8b1812cb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ebb241b947a0adfc8e50c5d71765c14af24593ae.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ebb9abf5b09e63cbe76390bb46ff7cbefb3141f0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ebb9abf5b09e63cbe76390bb46ff7cbefb3141f0.hip deleted file mode 100644 index 3750b4dfaa39..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ebb9abf5b09e63cbe76390bb46ff7cbefb3141f0.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec171210efd217c07d357fcf42e5372ad7e9abab.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec171210efd217c07d357fcf42e5372ad7e9abab.hip deleted file mode 100644 index d1d93065719e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec171210efd217c07d357fcf42e5372ad7e9abab.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec3deb1382003ac010d9bc1c59d1878d3ec7a727.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec3deb1382003ac010d9bc1c59d1878d3ec7a727.hip deleted file mode 100644 index 7df2d85c8eb6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec3deb1382003ac010d9bc1c59d1878d3ec7a727.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec51d24ab5f24e003ed6751ae8ae5b327892b15a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec51d24ab5f24e003ed6751ae8ae5b327892b15a.hip deleted file mode 100644 index d2cfc01a8428..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec51d24ab5f24e003ed6751ae8ae5b327892b15a.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec7ec8d547ee9713aa3b5b667f22cdcaa8f62b2d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec7ec8d547ee9713aa3b5b667f22cdcaa8f62b2d.hip deleted file mode 100644 index 6115f4218289..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec7ec8d547ee9713aa3b5b667f22cdcaa8f62b2d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - 
constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec7fc24902b1ebd8f2bf8088b0ecf6de8be8362d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec7fc24902b1ebd8f2bf8088b0ecf6de8be8362d.hip deleted file mode 100644 index 49a98116856a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec7fc24902b1ebd8f2bf8088b0ecf6de8be8362d.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec9f63a538940e5ace02ae5b5ddc01f730adac4d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec9f63a538940e5ace02ae5b5ddc01f730adac4d.hip deleted file mode 100644 index e46fb0817921..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ec9f63a538940e5ace02ae5b5ddc01f730adac4d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - 
ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eca613eaa8471ad7da66d2f8f2b8e07f6e02b467.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eca613eaa8471ad7da66d2f8f2b8e07f6e02b467.hip deleted file mode 100644 index 160e1e545d2d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eca613eaa8471ad7da66d2f8f2b8e07f6e02b467.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = 
fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ecd7dec90b3c62bf3a30bd75d3c6869529a06b01.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ecd7dec90b3c62bf3a30bd75d3c6869529a06b01.hip deleted file mode 100644 index 14f53fba4a1f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ecd7dec90b3c62bf3a30bd75d3c6869529a06b01.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::fp16_t, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ece60111633db08f765b3c7cd5cd768cbd030255.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ece60111633db08f765b3c7cd5cd768cbd030255.hip deleted file mode 
100644 index e3dd99261443..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ece60111633db08f765b3c7cd5cd768cbd030255.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 
blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ed37ba962e0288e2840eb0925d016b5a7e3b3164.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ed37ba962e0288e2840eb0925d016b5a7e3b3164.hip deleted file mode 100644 index 756e8c4ed621..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ed37ba962e0288e2840eb0925d016b5a7e3b3164.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ed6bdf67720e938d538a867548ac3579b8238169.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ed6bdf67720e938d538a867548ac3579b8238169.hip deleted file mode 100644 index 1b1b714a443e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ed6bdf67720e938d538a867548ac3579b8238169.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = 
k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ede81dbc4cb208ef6e684c76ba1eb451d37fe10c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ede81dbc4cb208ef6e684c76ba1eb451d37fe10c.hip deleted file mode 100644 index 2082ecf14643..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ede81dbc4cb208ef6e684c76ba1eb451d37fe10c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); 
- constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ee1a43f2210a8d1e5623411c95c33424cee5e747.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ee1a43f2210a8d1e5623411c95c33424cee5e747.hip deleted file mode 100644 index 21e4b98530be..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ee1a43f2210a8d1e5623411c95c33424cee5e747.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - 
ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ee239db5a67c23a383590a651f0d8a0be43a13c7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ee239db5a67c23a383590a651f0d8a0be43a13c7.hip deleted file mode 100644 index ab1e19dfddb7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ee239db5a67c23a383590a651f0d8a0be43a13c7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ee8e709eec7aef1fa681053c6d2969a5ff18c45c.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ee8e709eec7aef1fa681053c6d2969a5ff18c45c.hip deleted file mode 100644 index 17a2d575fd08..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ee8e709eec7aef1fa681053c6d2969a5ff18c45c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ee974931e65d6b16b7c868d462b95dcae20b7513.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ee974931e65d6b16b7c868d462b95dcae20b7513.hip deleted file mode 100644 index 5a581e2b944f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ee974931e65d6b16b7c868d462b95dcae20b7513.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eeb0e96b759e18cf703cfab0cda1385726f6e0a1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eeb0e96b759e18cf703cfab0cda1385726f6e0a1.hip deleted file mode 100644 index e373e6255556..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eeb0e96b759e18cf703cfab0cda1385726f6e0a1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eee408cf9456ff977aa7d12345e9b2f1e60639f1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eee408cf9456ff977aa7d12345e9b2f1e60639f1.hip deleted file mode 100644 index ec42f7faf0ec..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_eee408cf9456ff977aa7d12345e9b2f1e60639f1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ef2ebb4a86e7ed0001de9c5e607b66fe8877409f.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ef2ebb4a86e7ed0001de9c5e607b66fe8877409f.hip deleted file mode 100644 index f4cb467ac3aa..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ef2ebb4a86e7ed0001de9c5e607b66fe8877409f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ef40f0acf1885096efb840ec5600ec421c4db331.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ef40f0acf1885096efb840ec5600ec421c4db331.hip deleted file mode 100644 index 271b5666abe4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ef40f0acf1885096efb840ec5600ec421c4db331.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - false, false>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::fp16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, false, false, false, false, false, false>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ef5421703cbfa63a58ec02701e245d479a1fbfc1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ef5421703cbfa63a58ec02701e245d479a1fbfc1.hip deleted file mode 100644 index 
5b62ec886164..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ef5421703cbfa63a58ec02701e245d479a1fbfc1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 
blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ef7cc2aa1ffd38298b52764a93cd1271b4d92f8d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ef7cc2aa1ffd38298b52764a93cd1271b4d92f8d.hip deleted file mode 100644 index 778bfb8d4608..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ef7cc2aa1ffd38298b52764a93cd1271b4d92f8d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_efaa0cb33c71cb8ca7b83dd0e7a6c7b01f6b50a9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_efaa0cb33c71cb8ca7b83dd0e7a6c7b01f6b50a9.hip deleted file mode 100644 index 6c9757987822..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_efaa0cb33c71cb8ca7b83dd0e7a6c7b01f6b50a9.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 256, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<256, ck_tile::bf16_t, false, true, false>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_efb9e7d9af47cdf79f15f674f8976c05f08b0ce8.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_efb9e7d9af47cdf79f15f674f8976c05f08b0ce8.hip deleted file mode 100644 index 3d8e689b434b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_efb9e7d9af47cdf79f15f674f8976c05f08b0ce8.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_efc6a7b25710f0626c3af534111b161e1459d2e1.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_efc6a7b25710f0626c3af534111b161e1459d2e1.hip deleted file mode 100644 index eb0e08d17f00..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_efc6a7b25710f0626c3af534111b161e1459d2e1.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f01468c62c878295443981662e037ec5213cf7a3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f01468c62c878295443981662e037ec5213cf7a3.hip deleted file mode 100644 index 
c1572e5e14af..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f01468c62c878295443981662e037ec5213cf7a3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr 
dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f020134822739be6fa0bb3d98e9dec79f025324a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f020134822739be6fa0bb3d98e9dec79f025324a.hip deleted file mode 100644 index cd4d0b3d5a93..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f020134822739be6fa0bb3d98e9dec79f025324a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f0209426a8e6bfeef7d8ae7b16db791888142298.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f0209426a8e6bfeef7d8ae7b16db791888142298.hip deleted file mode 100644 index fd457d4544e9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f0209426a8e6bfeef7d8ae7b16db791888142298.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = 
ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f028af9e5e3c25800dde938e991aaab4fc1d64aa.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f028af9e5e3c25800dde938e991aaab4fc1d64aa.hip deleted file mode 100644 index 6be6bb016b28..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f028af9e5e3c25800dde938e991aaab4fc1d64aa.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f053c9c32518b895daaa3521827f37af78836fb8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f053c9c32518b895daaa3521827f37af78836fb8.hip deleted file mode 100644 index 213bad825ec0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f053c9c32518b895daaa3521827f37af78836fb8.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::fp16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f069b38b26c30bc770f74c856e47eb498f5818e7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f069b38b26c30bc770f74c856e47eb498f5818e7.hip deleted file mode 100644 index 8fc65e43f859..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f069b38b26c30bc770f74c856e47eb498f5818e7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f0cad48d9bc80d58705ea60eb2dda4baad68cedb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f0cad48d9bc80d58705ea60eb2dda4baad68cedb.hip deleted file mode 100644 index 864b348cc895..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f0cad48d9bc80d58705ea60eb2dda4baad68cedb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, 
ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f1246d1013d954a9316f4432c986d3be9459c548.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f1246d1013d954a9316f4432c986d3be9459c548.hip deleted file mode 100644 index 7dc3a42f6e8b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f1246d1013d954a9316f4432c986d3be9459c548.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f12f1f1b679cabab04218037ef370d2c7e1fe332.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f12f1f1b679cabab04218037ef370d2c7e1fe332.hip deleted file mode 100644 index 60c9e624df54..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f12f1f1b679cabab04218037ef370d2c7e1fe332.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f15c41ddb04ec7f80235bb3db19198dd6b699713.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f15c41ddb04ec7f80235bb3db19198dd6b699713.hip deleted file mode 100644 index e0107ade6e6b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f15c41ddb04ec7f80235bb3db19198dd6b699713.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f18c74becc24a93427d9c0838784e9b6caad6e81.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f18c74becc24a93427d9c0838784e9b6caad6e81.hip deleted file mode 100644 index 
36b0bc3cce91..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f18c74becc24a93427d9c0838784e9b6caad6e81.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks 
= k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f1ecc90ad7b86791a9e6f73a582aeff30f393804.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f1ecc90ad7b86791a9e6f73a582aeff30f393804.hip deleted file mode 100644 index 68533e7f9ef9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f1ecc90ad7b86791a9e6f73a582aeff30f393804.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f21596e8c608a795ff971aea8e199db9e72b65d7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f21596e8c608a795ff971aea8e199db9e72b65d7.hip deleted file mode 100644 index e40d17a03d40..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f21596e8c608a795ff971aea8e199db9e72b65d7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = 
ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f24bd5b92ce6bba640b8ec6b4e53fe35902c5572.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f24bd5b92ce6bba640b8ec6b4e53fe35902c5572.hip deleted file mode 100644 index bff2fa941435..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f24bd5b92ce6bba640b8ec6b4e53fe35902c5572.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f24d42e820adc1a26a428d59df7ffdd7f8580176.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f24d42e820adc1a26a428d59df7ffdd7f8580176.hip deleted file mode 100644 index 2837920826d0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f24d42e820adc1a26a428d59df7ffdd7f8580176.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f24f26e45d5cf567d29fbe375fbf8abdec39186f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f24f26e45d5cf567d29fbe375fbf8abdec39186f.hip deleted file mode 100644 index 4de3723f948b..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f24f26e45d5cf567d29fbe375fbf8abdec39186f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f25b87c435bc5d7d85d738f3fdf68947d79f5a77.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f25b87c435bc5d7d85d738f3fdf68947d79f5a77.hip deleted file mode 100644 index d51c980e8903..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f25b87c435bc5d7d85d738f3fdf68947d79f5a77.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f280e1639680ac1e5830a21f921bfe2cf364ef42.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f280e1639680ac1e5830a21f921bfe2cf364ef42.hip deleted file mode 100644 index 9aa3ab035db8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f280e1639680ac1e5830a21f921bfe2cf364ef42.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f2da112b1e07c44fc8a7f19368da203f6935049c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f2da112b1e07c44fc8a7f19368da203f6935049c.hip deleted file mode 100644 index 927922b0bda9..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f2da112b1e07c44fc8a7f19368da203f6935049c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - 
constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f30316cfe49323638f71ba688dd8ff9b2266b335.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f30316cfe49323638f71ba688dd8ff9b2266b335.hip deleted file mode 100644 index 70a90bcb6901..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f30316cfe49323638f71ba688dd8ff9b2266b335.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3193ea266f3718398bc5622f8bc7042c3527a42.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3193ea266f3718398bc5622f8bc7042c3527a42.hip deleted file mode 100644 index dbbb66f95a2a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3193ea266f3718398bc5622f8bc7042c3527a42.hip +++ /dev/null @@ -1,80 +0,0 @@ -// 
========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f34fdb8294257d951dcc9c4fa7ecf1192568b91b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f34fdb8294257d951dcc9c4fa7ecf1192568b91b.hip deleted file mode 100644 index e9b21bb10453..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f34fdb8294257d951dcc9c4fa7ecf1192568b91b.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f36aaa63ed42a578b953ebd614318d44cf44e8a3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f36aaa63ed42a578b953ebd614318d44cf44e8a3.hip deleted file mode 100644 index 029c41aa8323..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f36aaa63ed42a578b953ebd614318d44cf44e8a3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f395bec57c3b2e6e169134dd8d20b287d7405134.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f395bec57c3b2e6e169134dd8d20b287d7405134.hip deleted file mode 100644 index c7c012e4a802..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f395bec57c3b2e6e169134dd8d20b287d7405134.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3bf7ef503bb026258b3ec3d82d3ef1443046964.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3bf7ef503bb026258b3ec3d82d3ef1443046964.hip deleted file mode 100644 index 8053a28e2e83..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3bf7ef503bb026258b3ec3d82d3ef1443046964.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - 
false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3d0166931e4406873d8f552a5d5b61fde2391a3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3d0166931e4406873d8f552a5d5b61fde2391a3.hip deleted file mode 100644 index 5cb10d9b88de..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3d0166931e4406873d8f552a5d5b61fde2391a3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3fd08d56f8a9be1a8dd104cdb1ac58e283b5064.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3fd08d56f8a9be1a8dd104cdb1ac58e283b5064.hip deleted file mode 100644 index 1dc20d68b2e3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3fd08d56f8a9be1a8dd104cdb1ac58e283b5064.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3ff73f82aee3184849d04c2364eaa45c6d0de9c.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3ff73f82aee3184849d04c2364eaa45c6d0de9c.hip deleted file mode 100644 index d6eff9a6fe4c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f3ff73f82aee3184849d04c2364eaa45c6d0de9c.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f42cf0e5fe479690883507028748b0cd3dc83cbb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f42cf0e5fe479690883507028748b0cd3dc83cbb.hip deleted file mode 100644 index ec50bf67d0a8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f42cf0e5fe479690883507028748b0cd3dc83cbb.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4658c32d562f9d60c5ca1262a2e0df2375063bb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4658c32d562f9d60c5ca1262a2e0df2375063bb.hip deleted file mode 100644 index 5900808dbbd4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4658c32d562f9d60c5ca1262a2e0df2375063bb.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - true, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::bf16_t, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f48f8b681a405bfeba5aadaef40f32367ec5cd2b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f48f8b681a405bfeba5aadaef40f32367ec5cd2b.hip deleted file mode 100644 index db360db1a24f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f48f8b681a405bfeba5aadaef40f32367ec5cd2b.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 128, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<128, - ck_tile::fp16_t, - false, - false, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4900c0a5c0d03dc17d7a907ab40652d9920e756.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4900c0a5c0d03dc17d7a907ab40652d9920e756.hip deleted file mode 100644 index 85ae3594e285..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4900c0a5c0d03dc17d7a907ab40652d9920e756.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4a6438394dd3427f29aa0bbe58ad1f797c3c38d.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4a6438394dd3427f29aa0bbe58ad1f797c3c38d.hip deleted file mode 100644 index 2e4112941cdf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4a6438394dd3427f29aa0bbe58ad1f797c3c38d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4b87f983a5e84582efa1663f84da76cf60b5f6f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4b87f983a5e84582efa1663f84da76cf60b5f6f.hip deleted file mode 100644 index a92c5be29573..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4b87f983a5e84582efa1663f84da76cf60b5f6f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4c803838f5644ccc6f04f7c8a6233fed0b6639e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4c803838f5644ccc6f04f7c8a6233fed0b6639e.hip deleted file mode 100644 index 223a213ffaac..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4c803838f5644ccc6f04f7c8a6233fed0b6639e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4df1cbfbaf67705820f125b474469ad7ebab0c0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4df1cbfbaf67705820f125b474469ad7ebab0c0.hip deleted file mode 100644 index 2cc4ee1d2f8d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f4df1cbfbaf67705820f125b474469ad7ebab0c0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f50fa4ea674a590d0a817367ad9915a5fce20c51.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f50fa4ea674a590d0a817367ad9915a5fce20c51.hip deleted file mode 100644 index ecddb9f6b464..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f50fa4ea674a590d0a817367ad9915a5fce20c51.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f51f1a11f778d99a00aa5959a3e58a41fcbfb1e3.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f51f1a11f778d99a00aa5959a3e58a41fcbfb1e3.hip deleted file mode 100644 index eadfc2864ab8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f51f1a11f778d99a00aa5959a3e58a41fcbfb1e3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f525b59df454ccf53da6cb201e0aa8d09f52a2ad.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f525b59df454ccf53da6cb201e0aa8d09f52a2ad.hip deleted file mode 100644 index 6e20a2bd7e1e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f525b59df454ccf53da6cb201e0aa8d09f52a2ad.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << 
k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f57f84892e2a8496169b7406e63b0d4f5aa63aaf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f57f84892e2a8496169b7406e63b0d4f5aa63aaf.hip deleted file mode 100644 index 82d07ba47990..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f57f84892e2a8496169b7406e63b0d4f5aa63aaf.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - 
if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f5803aadd93e33567aa6b23100ce4fbb6c040dd6.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f5803aadd93e33567aa6b23100ce4fbb6c040dd6.hip deleted file mode 100644 index 985fe7298436..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f5803aadd93e33567aa6b23100ce4fbb6c040dd6.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f5f1797f6b672a55476348571ce17645c8a62869.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f5f1797f6b672a55476348571ce17645c8a62869.hip deleted file mode 100644 index a6173bb54fb5..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f5f1797f6b672a55476348571ce17645c8a62869.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f6566441ac3074578cfe45758ba0583c0da0a5ab.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f6566441ac3074578cfe45758ba0583c0da0a5ab.hip deleted file mode 100644 index 67ddf754dafc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f6566441ac3074578cfe45758ba0583c0da0a5ab.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f672bf80a78885428b2c02e522426470653a7351.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f672bf80a78885428b2c02e522426470653a7351.hip deleted file mode 100644 index 62da3abbfb5e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f672bf80a78885428b2c02e522426470653a7351.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f682399cd6412fed6a1141296a7e4d42078f7b29.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f682399cd6412fed6a1141296a7e4d42078f7b29.hip deleted file mode 100644 index 5a5c93702d7d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f682399cd6412fed6a1141296a7e4d42078f7b29.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f6856ca950bcf173571766c3f04de4163be0402e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f6856ca950bcf173571766c3f04de4163be0402e.hip deleted file mode 100644 index 45a2bd5fc9af..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f6856ca950bcf173571766c3f04de4163be0402e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f69548d6cced86c21c09c6475237a0cb926df0ed.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f69548d6cced86c21c09c6475237a0cb926df0ed.hip deleted file mode 100644 index 1d2c930df99a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f69548d6cced86c21c09c6475237a0cb926df0ed.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f69878f4ca8cfe6b8d8748766f66a1ef8eab20ad.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f69878f4ca8cfe6b8d8748766f66a1ef8eab20ad.hip deleted file mode 100644 index 9c3ff824c9ee..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f69878f4ca8cfe6b8d8748766f66a1ef8eab20ad.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - 
true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f6f102a388ffb05c690a20a29cfe0b35a35eed61.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f6f102a388ffb05c690a20a29cfe0b35a35eed61.hip deleted file mode 100644 index 8d1d9830b17d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f6f102a388ffb05c690a20a29cfe0b35a35eed61.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f7035f4bfd8f2f427720a07e3c311bccc1dba683.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f7035f4bfd8f2f427720a07e3c311bccc1dba683.hip deleted file mode 100644 index 92b7645f8d66..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f7035f4bfd8f2f427720a07e3c311bccc1dba683.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f71f96ce4dcc7f789a8ace73c230c203b05ff6dc.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f71f96ce4dcc7f789a8ace73c230c203b05ff6dc.hip deleted file mode 100644 index b47a9b453bfc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f71f96ce4dcc7f789a8ace73c230c203b05ff6dc.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, 
grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f727911254904ce4341e4ff5f8bafc430b8cfbbf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f727911254904ce4341e4ff5f8bafc430b8cfbbf.hip deleted file mode 100644 index f41a1ead45e6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f727911254904ce4341e4ff5f8bafc430b8cfbbf.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::bf16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f731289837f915e2aec1bd01eef1b3c1b099864d.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f731289837f915e2aec1bd01eef1b3c1b099864d.hip deleted file mode 100644 index a530df9ff2fe..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f731289837f915e2aec1bd01eef1b3c1b099864d.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 32, - false, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<32, - ck_tile::bf16_t, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f79def2b4edf6d18f6ef1d6b141f9e0435441f6a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f79def2b4edf6d18f6ef1d6b141f9e0435441f6a.hip deleted file mode 100644 index 50441617f1a1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f79def2b4edf6d18f6ef1d6b141f9e0435441f6a.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, false,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f7aa9c39b06e55bf4bc9f9a2a0fb075c9d4e69ce.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f7aa9c39b06e55bf4bc9f9a2a0fb075c9d4e69ce.hip deleted file mode 100644 index ee924640e3ec..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f7aa9c39b06e55bf4bc9f9a2a0fb075c9d4e69ce.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f7cf08242b3fb1c643d4149bec985b667b9d28fa.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f7cf08242b3fb1c643d4149bec985b667b9d28fa.hip deleted file mode 100644 index 87dd684ce707..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f7cf08242b3fb1c643d4149bec985b667b9d28fa.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f851da732f397624717160f89271514bc334b59b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f851da732f397624717160f89271514bc334b59b.hip deleted file mode 100644 index 2d9cc06d4caf..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f851da732f397624717160f89271514bc334b59b.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f861d8693f82d22e2c5b1abbcbae5f30f4433e5e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f861d8693f82d22e2c5b1abbcbae5f30f4433e5e.hip deleted file mode 100644 index 105e0f9bde4f..000000000000 
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f861d8693f82d22e2c5b1abbcbae5f30f4433e5e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr 
ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f87790f260630f312b84888dcbdf849ce130ae59.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f87790f260630f312b84888dcbdf849ce130ae59.hip deleted file mode 100644 index 4abd9dc4272a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f87790f260630f312b84888dcbdf849ce130ae59.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - 
if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f87991cb7787a29d3ce4711b4ce04c5fb6a14ca9.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f87991cb7787a29d3ce4711b4ce04c5fb6a14ca9.hip deleted file mode 100644 index 83789c37fc34..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f87991cb7787a29d3ce4711b4ce04c5fb6a14ca9.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t 
kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f90410c26d7649e21e2ae5e32e7af89d84d2ea70.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f90410c26d7649e21e2ae5e32e7af89d84d2ea70.hip deleted file mode 100644 index 6959c135d2e4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f90410c26d7649e21e2ae5e32e7af89d84d2ea70.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - 
constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f92e9a82c879051d6fe3c42108f8a574187704af.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f92e9a82c879051d6fe3c42108f8a574187704af.hip deleted file mode 100644 index 3086388a0d6a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f92e9a82c879051d6fe3c42108f8a574187704af.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using 
dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f93bc23b8a4f1e0fc5c5756c4e1c835bf59dea09.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f93bc23b8a4f1e0fc5c5756c4e1c835bf59dea09.hip deleted file mode 100644 index 9bac288ffe43..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f93bc23b8a4f1e0fc5c5756c4e1c835bf59dea09.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f93bf815b520a9d9e17b43bf9d7fb870751b6225.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f93bf815b520a9d9e17b43bf9d7fb870751b6225.hip deleted file mode 100644 index 589eaecd9cff..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f93bf815b520a9d9e17b43bf9d7fb870751b6225.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, 
- fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f974b12e83e214c30995a25631d37df1478927af.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f974b12e83e214c30995a25631d37df1478927af.hip deleted file mode 100644 index 0231413a7ac6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f974b12e83e214c30995a25631d37df1478927af.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 32, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<32, ck_tile::fp16_t, false, true, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f9824fb32933b27501ae8a7f43f460a2dda6a814.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f9824fb32933b27501ae8a7f43f460a2dda6a814.hip deleted file mode 100644 index 0c37c9de80cc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f9824fb32933b27501ae8a7f43f460a2dda6a814.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; 
- return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f98a6b193fec3203eaa75819f6b51aa45a48f212.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f98a6b193fec3203eaa75819f6b51aa45a48f212.hip deleted file mode 100644 index a0dc4114ef03..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f98a6b193fec3203eaa75819f6b51aa45a48f212.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f9c58761c927b222112cb5cb6c9acb5d3c915785.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f9c58761c927b222112cb5cb6c9acb5d3c915785.hip deleted file mode 100644 index e14383b3e0b8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_f9c58761c927b222112cb5cb6c9acb5d3c915785.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fa16fa84278b489af253b52839786f94aeeac36f.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fa16fa84278b489af253b52839786f94aeeac36f.hip deleted file mode 100644 index 314094b09d17..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fa16fa84278b489af253b52839786f94aeeac36f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fa62a97675719c2e8e9bb97361b92ff1c7b9d2ef.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fa62a97675719c2e8e9bb97361b92ff1c7b9d2ef.hip deleted file mode 100644 index c8195f0a1b63..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fa62a97675719c2e8e9bb97361b92ff1c7b9d2ef.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - 
-#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fa85f869a92f0482605e52019828244b12e12b44.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fa85f869a92f0482605e52019828244b12e12b44.hip deleted file mode 100644 index d0233e532888..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fa85f869a92f0482605e52019828244b12e12b44.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fabdc143c29d5ca50ab1e96a814bda6d05b0d5d2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fabdc143c29d5ca50ab1e96a814bda6d05b0d5d2.hip deleted file mode 100644 index b36818947836..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fabdc143c29d5ca50ab1e96a814bda6d05b0d5d2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fac5a0f98b94530befd634891e42c424bb86f0e1.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fac5a0f98b94530befd634891e42c424bb86f0e1.hip deleted file mode 100644 index 67c384812c6d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fac5a0f98b94530befd634891e42c424bb86f0e1.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fac99c3c82b77946f6844699d2333cd532a78a26.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fac99c3c82b77946f6844699d2333cd532a78a26.hip deleted file mode 100644 index 38a8b5a1fffb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fac99c3c82b77946f6844699d2333cd532a78a26.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 128, 32, 128>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<128, ck_tile::fp16_t, true,128, 128, 32, 128, 32, 128, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_faf56e45b2240515e97fc1bfd552eb03b6de5094.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_faf56e45b2240515e97fc1bfd552eb03b6de5094.hip deleted file mode 100644 index 
500e8708ed3f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_faf56e45b2240515e97fc1bfd552eb03b6de5094.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 
blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_faf686067fa433cea5e95dd523846dc881eff635.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_faf686067fa433cea5e95dd523846dc881eff635.hip deleted file mode 100644 index 9ce35b80def4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_faf686067fa433cea5e95dd523846dc881eff635.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb2fbb135d59028afcf867c2cf08edc323565528.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb2fbb135d59028afcf867c2cf08edc323565528.hip deleted file mode 100644 index e30a1d461cdb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb2fbb135d59028afcf867c2cf08edc323565528.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = 
ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb4c15452f9155c5966990f09432e5eb7e28e785.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb4c15452f9155c5966990f09432e5eb7e28e785.hip deleted file mode 100644 index 10d32f74fda0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb4c15452f9155c5966990f09432e5eb7e28e785.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb4c5f8fecfbbe16e6648becb3b5ca89fa3d8a94.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb4c5f8fecfbbe16e6648becb3b5ca89fa3d8a94.hip deleted file mode 100644 index 77fe95aa652e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb4c5f8fecfbbe16e6648becb3b5ca89fa3d8a94.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb5bb49928ce5515d7b297d5eadd4ec70a22d60b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb5bb49928ce5515d7b297d5eadd4ec70a22d60b.hip deleted file mode 100644 index 45a250920234..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb5bb49928ce5515d7b297d5eadd4ec70a22d60b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb79e1f9231692d736dbada062ed6821f34927bf.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb79e1f9231692d736dbada062ed6821f34927bf.hip deleted file mode 100644 index f1f09196bc69..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb79e1f9231692d736dbada062ed6821f34927bf.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb9477a613665cebcad781389ba7c5a36f51efe2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb9477a613665cebcad781389ba7c5a36f51efe2.hip deleted file mode 100644 index 3bb8ba9f058a..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fb9477a613665cebcad781389ba7c5a36f51efe2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fba36678d5047ded97ee7a7ba9feb9569afdb6ea.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fba36678d5047ded97ee7a7ba9feb9569afdb6ea.hip deleted file mode 100644 index 53cc0538ac89..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fba36678d5047ded97ee7a7ba9feb9569afdb6ea.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fba47fa8d9b5375bc408af68b67345ab9dba2eb8.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fba47fa8d9b5375bc408af68b67345ab9dba2eb8.hip deleted file mode 100644 index 55cabdff7bb2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fba47fa8d9b5375bc408af68b67345ab9dba2eb8.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::bf16_t, false,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, false, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fbea85b766bf0c918ee0baf24dffc6a5563d5105.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fbea85b766bf0c918ee0baf24dffc6a5563d5105.hip deleted file mode 100644 index 9921145904e6..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fbea85b766bf0c918ee0baf24dffc6a5563d5105.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr 
ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fbeec221cd63adaedceec39db41ea942f99f5133.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fbeec221cd63adaedceec39db41ea942f99f5133.hip deleted file mode 100644 index 09f49e460543..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fbeec221cd63adaedceec39db41ea942f99f5133.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - 
std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc030b61ae20c4b7d9b2d10930a17e01e9e93328.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc030b61ae20c4b7d9b2d10930a17e01e9e93328.hip deleted file mode 100644 index d7cbb6a3f2f0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc030b61ae20c4b7d9b2d10930a17e01e9e93328.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc1790325b59bd44b0a5f6cf9723a25fd845cba7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc1790325b59bd44b0a5f6cf9723a25fd845cba7.hip deleted file mode 100644 index 9a28018340c9..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc1790325b59bd44b0a5f6cf9723a25fd845cba7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc1eb85a00017efdc610e4259d2abe935b85304f.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc1eb85a00017efdc610e4259d2abe935b85304f.hip deleted file mode 100644 index 1a6edf526b17..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc1eb85a00017efdc610e4259d2abe935b85304f.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc5841a729099340d608e31023acbeaeade3e886.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc5841a729099340d608e31023acbeaeade3e886.hip deleted file mode 100644 index c2246bfb6ac1..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc5841a729099340d608e31023acbeaeade3e886.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - 
false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc5ebf0f2200f37ccc0849e0c3745f6e2f00111d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc5ebf0f2200f37ccc0849e0c3745f6e2f00111d.hip deleted file mode 100644 index 856f565889e8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc5ebf0f2200f37ccc0849e0c3745f6e2f00111d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc7b0916744b593435d8e1e7b6d874d760cd5e3b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc7b0916744b593435d8e1e7b6d874d760cd5e3b.hip deleted file mode 100644 index 29d0b32e81bb..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc7b0916744b593435d8e1e7b6d874d760cd5e3b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc86c13e933cba40553ffba31d53aad27415ce4b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc86c13e933cba40553ffba31d53aad27415ce4b.hip deleted file mode 100644 index e9dbee4392b2..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fc86c13e933cba40553ffba31d53aad27415ce4b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, 
grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fcb0b08e29b2e1bf181fceceb9dc416e54f52b00.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fcb0b08e29b2e1bf181fceceb9dc416e54f52b00.hip deleted file mode 100644 index ab6151153513..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fcb0b08e29b2e1bf181fceceb9dc416e54f52b00.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - 
fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fcb6ef39c3db49f26f736d6c9221dd825409ec4e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fcb6ef39c3db49f26f736d6c9221dd825409ec4e.hip deleted file mode 100644 index 43036d391d72..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fcb6ef39c3db49f26f736d6c9221dd825409ec4e.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, true,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, true, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fcbe827108d252b2f5847fa8e132c9c3e56a90a0.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fcbe827108d252b2f5847fa8e132c9c3e56a90a0.hip deleted file mode 100644 index afbe0fc50eb6..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fcbe827108d252b2f5847fa8e132c9c3e56a90a0.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fccabea88b8e290688c1b360875d228e6fdf1624.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fccabea88b8e290688c1b360875d228e6fdf1624.hip deleted file mode 100644 index efeeb811fb30..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fccabea88b8e290688c1b360875d228e6fdf1624.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd10a3b937e9659716925e39a01d794914b08e26.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd10a3b937e9659716925e39a01d794914b08e26.hip deleted file mode 100644 index dce14072009c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd10a3b937e9659716925e39a01d794914b08e26.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 16, 32, 32, 32>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<2, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - true, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<32, ck_tile::fp16_t, true,128, 64, 16, 32, 32, 32, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, true, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd19d7614f2ed5da21a52ed172ef62cc07c9c01a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd19d7614f2ed5da21a52ed172ef62cc07c9c01a.hip deleted file mode 100644 index 56fcf8377dbe..000000000000 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd19d7614f2ed5da21a52ed172ef62cc07c9c01a.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - true, - false, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::bf16_t, - true, - true, - false, - false>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd26e43ca652e6f58ff48c356165aa4349833b55.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd26e43ca652e6f58ff48c356165aa4349833b55.hip deleted file mode 100644 index c81a859cd7c0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd26e43ca652e6f58ff48c356165aa4349833b55.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd345632e0cae0d549ba79626a08b1885711deb6.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd345632e0cae0d549ba79626a08b1885711deb6.hip deleted file mode 100644 index 3dca81219aff..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd345632e0cae0d549ba79626a08b1885711deb6.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 128, 32, 256, 32, 256>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVS< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<256, ck_tile::bf16_t, false,128, 128, 32, 256, 32, 256, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::NO_BIAS, false, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd3558b4c7a667dbc365c4c2ceda646975408f51.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd3558b4c7a667dbc365c4c2ceda646975408f51.hip deleted file mode 100644 index 8d236cf911d0..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd3558b4c7a667dbc365c4c2ceda646975408f51.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd614df484b263deae3b3c20adb0ce7b62eaa651.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd614df484b263deae3b3c20adb0ce7b62eaa651.hip deleted file mode 100644 index 9de1995c234f..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd614df484b263deae3b3c20adb0ce7b62eaa651.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args 
a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd9cd1305633b62b68fb8474ce021f639f8492e7.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd9cd1305633b62b68fb8474ce021f639f8492e7.hip deleted file mode 100644 index abbe42c081a3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fd9cd1305633b62b68fb8474ce021f639f8492e7.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - false, - true, - true, - true>; - -#include 
- -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fde12cd366d6850ce26afce98e5076b695b4875b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fde12cd366d6850ce26afce98e5076b695b4875b.hip deleted file mode 100644 index 1203456f8d9c..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fde12cd366d6850ce26afce98e5076b695b4875b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - 
typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe245e9ea974adce2b9807d33b9ba12d916eaffb.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe245e9ea974adce2b9807d33b9ba12d916eaffb.hip deleted file mode 100644 index e4e3c95292d4..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe245e9ea974adce2b9807d33b9ba12d916eaffb.hip +++ /dev/null @@ -1,80 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile::sequence<128, 64, 32, 64, 32, 64>; -using fmha_warp_tile_0 = ck_tile::sequence<32, 32, 16>; - -using fmha_shape_0 = ck_tile::TileFmhaShape, - fmha_warp_tile_0, - ck_tile::sequence<4, 1, 1>, - fmha_warp_tile_0, - true>; - -using fmha_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; - -using fmha_pipeline_problem_0 = ck_tile::BlockFmhaPipelineProblem< - typename FmhaFwdTypeConfig::QDataType, - typename FmhaFwdTypeConfig::KDataType, - typename FmhaFwdTypeConfig::VDataType, - typename FmhaFwdTypeConfig::SaccDataType, - typename FmhaFwdTypeConfig::SMPLComputeDataType, - typename FmhaFwdTypeConfig::BiasDataType, - typename FmhaFwdTypeConfig::RandValOutputDataType, - typename FmhaFwdTypeConfig::LSEDataType, - typename FmhaFwdTypeConfig::PDataType, - typename FmhaFwdTypeConfig::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - fmha_shape_0, - false, - fmha_mask_0, - fmha_trait_0>; - -using fmha_pipeline_0 = ck_tile::BlockFmhaPipelineQRKSVSAsync< - fmha_pipeline_problem_0>; - -using fmha_epilogue_0 = - ck_tile::Default2DEpilogue::OaccDataType, - typename FmhaFwdTypeConfig::ODataType, - true, true>>; - -using fmha_kernel_0 = - ck_tile::FmhaFwdKernel, - fmha_pipeline_0, - fmha_epilogue_0>; - -using trait_0 = fmha_fwd_traits_<64, ck_tile::bf16_t, false,128, 64, 32, 64, 32, 64, true, - ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC, fmha_mask_0, ck_tile::BlockAttentionBiasEnum::ALIBI, true, false, false, true, true, true, true>; - -#include - -template<> -float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) -{ - using k_ = fmha_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe72cdd69944d2d765478d4aed13066a02b76f6d.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe72cdd69944d2d765478d4aed13066a02b76f6d.hip deleted file mode 100644 index 0985b165254e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe72cdd69944d2d765478d4aed13066a02b76f6d.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 64, 32, 64, 32, 32, 64, 64>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<64, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe8b8c3525fe86a20a2d6c69585f3e36c16caabd.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe8b8c3525fe86a20a2d6c69585f3e36c16caabd.hip deleted file mode 100644 index dcafc647b481..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe8b8c3525fe86a20a2d6c69585f3e36c16caabd.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - true, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - true, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) 
-{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe97b7adcd67ed9bda8831d1f3f1ca7590c6d251.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe97b7adcd67ed9bda8831d1f3f1ca7590c6d251.hip deleted file mode 100644 index a904086533b8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe97b7adcd67ed9bda8831d1f3f1ca7590c6d251.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - true, - false, - false, - false, - 
false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe9d98dbec5096a89b116f85675af772f023014a.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe9d98dbec5096a89b116f85675af772f023014a.hip deleted file mode 100644 index 7cf999339a07..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fe9d98dbec5096a89b116f85675af772f023014a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - 
ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_feb5e77111fe1e20bafdb83a925b5faeeb6214af.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_feb5e77111fe1e20bafdb83a925b5faeeb6214af.hip deleted file mode 100644 index 88d22e1537ef..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_feb5e77111fe1e20bafdb83a925b5faeeb6214af.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fecd7501265b4c4dcf015485e63e2324304f70d3.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fecd7501265b4c4dcf015485e63e2324304f70d3.hip deleted file mode 100644 index 7c1ee5f4a10d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fecd7501265b4c4dcf015485e63e2324304f70d3.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - 
fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fecffa403b3631b1957e1a9a06f18fdb3b4eee5f.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fecffa403b3631b1957e1a9a06f18fdb3b4eee5f.hip deleted file mode 100644 index 3c233e8412a8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fecffa403b3631b1957e1a9a06f18fdb3b4eee5f.hip +++ /dev/null @@ -1,65 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_bwd_dot_do_o_trait_0 = - ck_tile::TileFmhaBwdOGradDotOTraits; - -using fmha_bwd_dot_do_o_pipeline_problem_0 = ck_tile::BlockFmhaBwdOGradDotOPipelineProblem< - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, - 32, - false, - fmha_bwd_dot_do_o_trait_0>; - -using fmha_bwd_dot_do_o_0 = - typename ck_tile::BlockFmhaBwdOGradDotO; - -using fmha_bwd_dot_do_o_kernel_0 = - ck_tile::FmhaBwdOGradDotOKernel; - -using dot_do_o_trait_0 = - fmha_bwd_dot_do_o_traits_<32, ck_tile::bf16_t, false, true, true>; - -#include - -template <> -float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dot_do_o_get_name_() -{ - using k_ = fmha_bwd_dot_do_o_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ff453e3bdc9752cb7b81f7cc3056325a8b9a8ad4.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ff453e3bdc9752cb7b81f7cc3056325a8b9a8ad4.hip deleted file mode 100644 index 3ad7535c5405..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ff453e3bdc9752cb7b81f7cc3056325a8b9a8ad4.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. 
-// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = 
fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ff6862dbdbb20bc63a650e1f93e9ac169bb702b2.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ff6862dbdbb20bc63a650e1f93e9ac169bb702b2.hip deleted file mode 100644 index 658f8e0f01b3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ff6862dbdbb20bc63a650e1f93e9ac169bb702b2.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return 
ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ffb5b7349a671b182d73c8016590f26fe06a4cba.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ffb5b7349a671b182d73c8016590f26fe06a4cba.hip deleted file mode 100644 index c71bd6afbbfc..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ffb5b7349a671b182d73c8016590f26fe06a4cba.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::bf16_t, - 
false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - true, - false, - true, - true, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ffb8adef0cef91a86f36872407fea35df90e8f2b.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ffb8adef0cef91a86f36872407fea35df90e8f2b.hip deleted file mode 100644 index 3e48b55ce5c8..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ffb8adef0cef91a86f36872407fea35df90e8f2b.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::bf16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::bf16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ffc6056d9fe125a4dbe08c1d86354e51f7daadd5.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ffc6056d9fe125a4dbe08c1d86354e51f7daadd5.hip deleted file mode 100644 index 3d71b992a99d..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ffc6056d9fe125a4dbe08c1d86354e51f7daadd5.hip +++ /dev/null @@ -1,73 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_bwd_convert_dq_trait_0 = - ck_tile::TileFmhaBwdConvertQGradTraits; - -using fmha_bwd_convert_dq_pipeline_problem_0 = - ck_tile::BlockFmhaBwdConvertQGradPipelineProblem< - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::QGradDataType, - /* BlockSize = */ 256, - 64, - 128, - 64, - false, - true, - fmha_bwd_convert_dq_trait_0>; - -using fmha_bwd_convert_dq_0 = - typename ck_tile::BlockFmhaBwdConvertQGrad; - -using fmha_bwd_convert_dq_kernel_0 = - ck_tile::FmhaBwdConvertQGradKernel; - -using convert_dq_trait_0 = fmha_bwd_convert_dq_traits_<64, - ck_tile::fp16_t, - false, - true, - false, - true>; - -#include - -template <> -float fmha_bwd_convert_dq_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_convert_dq_get_name_() -{ - using k_ = fmha_bwd_convert_dq_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ffd868d49abdb769ab82c21508d655daf54b8a99.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ffd868d49abdb769ab82c21508d655daf54b8a99.hip deleted file mode 100644 index c95405c962a7..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_ffd868d49abdb769ab82c21508d655daf54b8a99.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 64, 256, 16, 256, 16, 32, 256, 256>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - true>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - true>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<256, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::NO_BIAS, - false, - false, - true, - true, - true, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fff7aa57cca501f221077124359a589b3a6f9d0a.hip 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fff7aa57cca501f221077124359a589b3a6f9d0a.hip deleted file mode 100644 index 3822d9fcbd71..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fff7aa57cca501f221077124359a589b3a6f9d0a.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<16, 128, 128, 16, 128, 16, 32, 128, 128>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<1, 4, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - true, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - false, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - false, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<128, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - false, - false, - false, - true>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& 
s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fffbfcac254e33926131a71905e93f9cc0aef89e.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fffbfcac254e33926131a71905e93f9cc0aef89e.hip deleted file mode 100644 index 56f1ea131d35..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_ck_autogen_fffbfcac254e33926131a71905e93f9cc0aef89e.hip +++ /dev/null @@ -1,138 +0,0 @@ -// ========================================== -// THIS CODE IS AUTOGENERATED. DO NOT MODIFY. -// @generated -// ========================================== -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -// auto generated by generate.py -#include - -using fmha_dtype_0 = ck_tile::fp16_t; - -using fmha_block_tile_0 = ck_tile:: - sequence<32, 128, 32, 32, 32, 32, 64, 32, 32>; -using fmha_block_warps0_0 = ck_tile::sequence<1, 4, 1>; -using fmha_block_warps1_0 = ck_tile::sequence<4, 1, 1>; -using fmha_block_warps2_0 = ck_tile::sequence<2, 2, 1>; -using fmha_warp_tile0_0 = ck_tile::sequence<16, 16, 32>; -using fmha_warp_tile1_0 = ck_tile::sequence<16, 16, 16>; - -// TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape -// G0&G2 -> GSdP -// G1&G3 -> GdKV -// G4 -> GdQ -using fmha_bwd_shape_0 = ck_tile::TileFmhaBwdShape; - -using fmha_bwd_trait_0 = ck_tile::TileFmhaTraits; -using fmha_mask_0 = ck_tile::SimplifiedGenericAttentionMask; -using fmha_dropout_0 = ck_tile::BlockDropoutBwd; - -using fmha_bwd_pipeline_problem_0 = ck_tile::BlockFmhaBwdPipelineProblem< - typename FmhaBwdTypeConfig::QDataType, - typename FmhaBwdTypeConfig::KDataType, - typename FmhaBwdTypeConfig::VDataType, - typename FmhaBwdTypeConfig::GemmDataType, - typename FmhaBwdTypeConfig::LSEDataType, - typename FmhaBwdTypeConfig::AccDataType, - typename FmhaBwdTypeConfig::DDataType, - typename FmhaBwdTypeConfig::BiasDataType, - typename FmhaBwdTypeConfig::RandValOutputDataType, - typename FmhaBwdTypeConfig::ODataType, - typename FmhaBwdTypeConfig::OGradDataType, - typename FmhaBwdTypeConfig::QGradDataType, - typename FmhaBwdTypeConfig::KGradDataType, - typename FmhaBwdTypeConfig::VGradDataType, - typename FmhaBwdTypeConfig::BiasGradDataType, - fmha_bwd_shape_0, - false, - false, - fmha_mask_0, - fmha_dropout_0, - fmha_bwd_trait_0>; - -using fmha_bwd_pipeline_0 = ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP; - -using fmha_bwd_dk_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::KGradDataType, - true, - false>>; - -using fmha_bwd_dv_epilogue_0 = ck_tile::Default2DEpilogue< - ck_tile::Default2DEpilogueProblem::AccDataType, - typename FmhaBwdTypeConfig::VGradDataType, - true, - false>>; - -using fmha_bwd_dq_dk_dv_kernel_0 = - ck_tile::FmhaBwdDQDKDVKernel; - -using dq_dk_dv_trait_0 = fmha_bwd_dq_dk_dv_traits_<32, - ck_tile::fp16_t, - false, - ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP, - fmha_mask_0, - fmha_dropout_0, - ck_tile::BlockAttentionBiasEnum::ALIBI, - false, - false, - true, - false, - 
false, - false>; - -#include - -template <> -float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s, fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - if(s.log_level_ > 0) - std::cout << ", " << k_::GetName() << std::flush; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel( - s, ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)); -} - -template <> -void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config& s, - fmha_bwd_args a) -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); - constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel_pt(k_{}, grids, blocks, 0, kargs)( - ck_tile::stream_config{s.stream_id_}); -} - -template <> -std::string fmha_bwd_dq_dk_dv_get_name_() -{ - using k_ = fmha_bwd_dq_dk_dv_kernel_0; - return k_::GetName(); -} From 0d275281ba12d23fc07987d7a9a5206bf3383b7e Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Tue, 14 Jan 2025 20:09:29 +0000 Subject: [PATCH 03/46] bring new fmha_bwd/fwd.hpp files --- .../hip/flash_attn/ck/fmha_bwd.hpp | 12 +- .../hip/flash_attn/ck/fmha_fwd.hpp | 236 +++++++++++------- 2 files changed, 153 insertions(+), 95 deletions(-) diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd.hpp b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd.hpp index 002b99c9cbf5..38ec2ef20c5c 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd.hpp +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd.hpp @@ -15,11 +15,19 @@ #include #include +struct FmhaBwdFp16 +{ +}; + +struct FmhaBwdBf16 +{ +}; + template struct FmhaBwdTypeConfig; template <> -struct FmhaBwdTypeConfig +struct FmhaBwdTypeConfig { using QDataType = ck_tile::half_t; using KDataType = ck_tile::half_t; @@ -39,7 +47,7 @@ struct FmhaBwdTypeConfig }; template <> -struct FmhaBwdTypeConfig +struct FmhaBwdTypeConfig { using QDataType = ck_tile::bf16_t; using KDataType = ck_tile::bf16_t; diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd.hpp b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd.hpp index d4ccb668c602..2de70cd49bbb 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd.hpp +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd.hpp @@ -17,11 +17,35 @@ #include #include +struct FmhaFwdFp16 +{ +}; + +struct FmhaFwdBf16 +{ +}; + +struct FmhaFwdFp8 +{ +}; + +struct FmhaFwdBf8 +{ +}; + +struct FmhaFwdFp8Fp16 +{ +}; + +struct FmhaFwdFp8Bf16 +{ +}; + template struct FmhaFwdTypeConfig; template <> -struct FmhaFwdTypeConfig +struct FmhaFwdTypeConfig { using QDataType = ck_tile::half_t; using KDataType = ck_tile::half_t; @@ -37,7 +61,7 @@ struct FmhaFwdTypeConfig }; template <> -struct FmhaFwdTypeConfig +struct FmhaFwdTypeConfig { using QDataType = ck_tile::bf16_t; using KDataType = ck_tile::bf16_t; @@ -53,7 +77,7 @@ struct FmhaFwdTypeConfig }; template <> -struct FmhaFwdTypeConfig +struct FmhaFwdTypeConfig { using QDataType = ck_tile::fp8_t; using KDataType = ck_tile::fp8_t; @@ -69,7 +93,7 @@ struct FmhaFwdTypeConfig }; template <> -struct FmhaFwdTypeConfig +struct FmhaFwdTypeConfig { using QDataType = ck_tile::bf8_t; using KDataType = ck_tile::bf8_t; @@ -166,6 +190,8 @@ struct fmha_fwd_splitkv_args void* block_table_ptr; ck_tile::index_t batch_stride_block_table; // only 
used if 'block_table_ptr' is not nullptr ck_tile::index_t page_block_size; // only used if 'block_table_ptr' is not nullptr + bool is_gappy; // differentiate seqstart_k_ptr usage. only used if 'block_table_ptr' is not + // nullptr. const void* cache_batch_idx; @@ -174,9 +200,21 @@ struct fmha_fwd_splitkv_args // seqlen_k = kargs.seqlen_k // group mode: seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b] // seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b] - // kvcache mode (use same kernel as batch mode): + // or kargs.seqlen_k_ptr[b] + // + // batch mode (kvcache): // seqlen_q = kargs.seqlen_q + // seqlen_k = kargs.seqlen_k_ptr[b] + // group mode (kvcache): + // seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b] + // + // when is_gappy=true: + // seqlen_k = kargs.seqlen_k_ptr[b] + // seqstart_k_ptr[b] now store local offset of each batch + // + // when is_gappy=false: // seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b] + // or kargs.seqlen_k_ptr[b] const void* seqstart_q_ptr; const void* seqstart_k_ptr; const void* seqlen_k_ptr; @@ -252,7 +290,7 @@ struct fmha_fwd_appendkv_args ck_tile::index_t batch_stride_block_table; // only used if 'block_table_ptr' is not nullptr ck_tile::index_t page_block_size; // only used if 'block_table_ptr' is not nullptr - const void* cache_batch_idx; + const void* cache_batch_idx; // only used if block_table_ptr is nullptr -> batch mode (kvcache) ck_tile::index_t stride_q; ck_tile::index_t stride_k; @@ -280,91 +318,101 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args) if constexpr(FmhaKernel::kIsGroupMode) { return FmhaKernel::MakeKargsImpl(args.q_ptr, - args.k_ptr, - args.v_ptr, - args.bias_ptr, - args.rand_val_ptr, - args.lse_ptr, - args.o_ptr, - args.seqstart_q_ptr, - args.seqstart_k_ptr, - args.seqlen_k_ptr, - args.hdim_q, - args.hdim_v, - args.nhead_q, - args.nhead_q / args.nhead_k, - args.scale_s, - args.scale_p, - args.scale_o, - args.stride_q, - args.stride_k, - args.stride_v, - args.stride_bias, - args.stride_randval, - args.stride_o, - args.nhead_stride_q, - args.nhead_stride_k, - args.nhead_stride_v, - args.nhead_stride_bias, - args.nhead_stride_randval, - args.nhead_stride_lse, - args.nhead_stride_o, - args.window_size_left, - args.window_size_right, - args.mask_type, - args.p_drop, - args.s_randval, - args.drop_seed_offset); + args.k_ptr, + args.v_ptr, + args.bias_ptr, + args.rand_val_ptr, + args.lse_ptr, + args.o_ptr, + args.seqstart_q_ptr, + args.seqstart_k_ptr, + args.seqlen_k_ptr, + args.hdim_q, + args.hdim_v, + args.nhead_q, + args.nhead_q / args.nhead_k, + args.scale_s, + args.scale_p, + args.scale_o, + args.stride_q, + args.stride_k, + args.stride_v, + args.stride_bias, + args.stride_randval, + args.stride_o, + args.nhead_stride_q, + args.nhead_stride_k, + args.nhead_stride_v, + args.nhead_stride_bias, + args.nhead_stride_randval, + args.nhead_stride_lse, + args.nhead_stride_o, + args.window_size_left, + args.window_size_right, + args.mask_type, + args.p_drop, + args.s_randval, + args.drop_seed_offset); } else { // create batch mode kernel arguments return FmhaKernel::MakeKargsImpl(args.q_ptr, - args.k_ptr, - args.v_ptr, - args.bias_ptr, - args.rand_val_ptr, - args.lse_ptr, - args.o_ptr, - args.seqlen_q, - args.seqlen_k, - args.hdim_q, - args.hdim_v, - args.nhead_q, - args.nhead_q / args.nhead_k, - args.scale_s, - args.scale_p, - args.scale_o, - args.stride_q, - args.stride_k, - args.stride_v, - args.stride_bias, - args.stride_randval, - args.stride_o, - 
args.nhead_stride_q, - args.nhead_stride_k, - args.nhead_stride_v, - args.nhead_stride_bias, - args.nhead_stride_randval, - args.nhead_stride_lse, - args.nhead_stride_o, - args.batch_stride_q, - args.batch_stride_k, - args.batch_stride_v, - args.batch_stride_bias, - args.batch_stride_randval, - args.batch_stride_lse, - args.batch_stride_o, - args.window_size_left, - args.window_size_right, - args.mask_type, - args.p_drop, - args.s_randval, - args.drop_seed_offset); + args.k_ptr, + args.v_ptr, + args.bias_ptr, + args.rand_val_ptr, + args.lse_ptr, + args.o_ptr, + args.seqlen_q, + args.seqlen_k, + args.hdim_q, + args.hdim_v, + args.nhead_q, + args.nhead_q / args.nhead_k, + args.scale_s, + args.scale_p, + args.scale_o, + args.stride_q, + args.stride_k, + args.stride_v, + args.stride_bias, + args.stride_randval, + args.stride_o, + args.nhead_stride_q, + args.nhead_stride_k, + args.nhead_stride_v, + args.nhead_stride_bias, + args.nhead_stride_randval, + args.nhead_stride_lse, + args.nhead_stride_o, + args.batch_stride_q, + args.batch_stride_k, + args.batch_stride_v, + args.batch_stride_bias, + args.batch_stride_randval, + args.batch_stride_lse, + args.batch_stride_o, + args.window_size_left, + args.window_size_right, + args.mask_type, + args.p_drop, + args.s_randval, + args.drop_seed_offset); } }(); - dim3 grids = FmhaKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v); - return ck_tile::make_tuple(kargs, grids); + if constexpr(FmhaKernel::kIsGroupMode) + { + dim3 grids = FmhaKernel::GridSize( + args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v, args.seqlen_k_ptr != nullptr); + return ck_tile::make_tuple(kargs, grids); + } + else + { + dim3 grids = + FmhaKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v, false); + return ck_tile::make_tuple(kargs, grids); + } } template @@ -375,7 +423,7 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args) // create group mode kernel arguments if constexpr(Kernel::kIsGroupMode) { - return Kernel::MakeKargsImpl(args.q_ptr, + return Kernel::MakeKargs(args.q_ptr, args.k_ptr, args.v_ptr, args.bias_ptr, @@ -390,6 +438,10 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args) args.nhead_q, args.nhead_q / args.nhead_k, args.num_splits, + args.block_table_ptr, + args.batch_stride_block_table, + args.page_block_size, + args.is_gappy, args.scale_s, args.scale_p, args.stride_q, @@ -413,7 +465,7 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args) } else { // create batch mode kernel arguments - return Kernel::MakeKargsImpl(args.q_ptr, + return Kernel::MakeKargs(args.q_ptr, args.k_ptr, args.v_ptr, args.bias_ptr, @@ -459,8 +511,8 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args) } }(); - dim3 grids = - Kernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v, args.num_splits); + dim3 grids = Kernel::GridSize( + args.batch, args.nhead_q, args.nhead_k, args.max_seqlen_q, args.hdim_v, args.num_splits); return ck_tile::make_tuple(kargs, grids); } @@ -473,7 +525,7 @@ auto fmha_fwd_splitkv_combine_create_kargs_and_grids(fmha_fwd_splitkv_args args) // create group mode kernel argumentszs if constexpr(Kernel::kIsGroupMode) { - return Kernel::MakeKargsImpl(args.lse_acc_ptr, + return Kernel::MakeKargs(args.lse_acc_ptr, args.o_acc_ptr, args.lse_ptr, args.o_ptr, @@ -493,7 +545,7 @@ auto fmha_fwd_splitkv_combine_create_kargs_and_grids(fmha_fwd_splitkv_args args) } else { // create batch mode kernel arguments - return 
Kernel::MakeKargsImpl(args.lse_acc_ptr, + return Kernel::MakeKargs(args.lse_acc_ptr, args.o_acc_ptr, args.lse_ptr, args.o_ptr, @@ -526,7 +578,7 @@ template auto fmha_fwd_appendkv_create_kargs_and_grids(fmha_fwd_appendkv_args args) { assert(args.nhead_q % args.nhead_k == 0); - auto kargs = Kernel::MakeKargsImpl(args.q_ptr, + auto kargs = Kernel::MakeKargs(args.q_ptr, args.k_ptr, args.knew_ptr, args.v_ptr, @@ -668,7 +720,6 @@ std::string fmha_fwd_splitkv_get_name_(); template ; static constexpr bool kIsGroupMode = kIsGroupMode_; - static constexpr ck_tile::index_t kM0 = kM0_; static constexpr ck_tile::index_t kN1 = kN1_; static constexpr bool kStoreLse = kStoreLse_; static constexpr bool kDoFp8StaticQuant = kDoFp8StaticQuant_; From cf1e5bece498d005662de7afde5ba1483fb9f38c Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Mon, 10 Feb 2025 16:22:12 +0000 Subject: [PATCH 04/46] add_subdirectory to ATen/CMakeLists.txt --- aten/src/ATen/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index ab95de5036bd..874b45688d50 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -183,6 +183,8 @@ if(USE_FLASH_ATTENTION) endif() endif() message(STATUS "USE_CK_FLASH_ATTENTION is set; building PyTorch with CK Flash Attention enabled") + message(STATUS "Generating CK kernel instances...") + add_subdirectory(native/transformers/hip/flash_attn/ck) file(GLOB flash_attention_hip_ck_hip "native/transformers/hip/flash_attn/ck/*.hip") list(APPEND native_transformers_hip_hip ${flash_attention_hip_ck_hip}) endif() From 92c437ec09944230f50d09998b845335c62ae2ca Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Mon, 10 Feb 2025 17:00:55 +0000 Subject: [PATCH 05/46] Add script for swapping make_kernel with make_kernel_pt --- .../hip/flash_attn/ck/add_make_kernel_pt.sh | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100755 aten/src/ATen/native/transformers/hip/flash_attn/ck/add_make_kernel_pt.sh diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/add_make_kernel_pt.sh b/aten/src/ATen/native/transformers/hip/flash_attn/ck/add_make_kernel_pt.sh new file mode 100755 index 000000000000..80515021ba12 --- /dev/null +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/add_make_kernel_pt.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# Check if the input file is provided +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Assign the input file to a variable +file_list=$1 + +# Check if the file exists +if [ ! -f "$file_list" ]; then + echo "Error: File '$file_list' not found!" + exit 1 +fi + +# Loop through each line in the file list +while IFS= read -r file; do + # Check if the file exists in the current directory + if [ -f "$file" ]; then + # Use sed to replace "make_kernel" with "make_kernel_pt" in place + sed -i 's/make_kernel/make_kernel_pt/g' "$file" + echo "Updated: $file" + else + echo "Skipping: $file (not found)" + fi +done < "$file_list" + +echo "Replacement completed." 
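# Usage sketch, not part of the patch: add_make_kernel_pt.sh expects a newline-separated
# list of generated kernel sources, which is what generate.py --list_blobs emits and what
# the CMakeLists.txt introduced in the next patch passes to it. Run by hand from the ck
# directory, the equivalent forward-pass sequence would look roughly like this (paths
# abbreviated; the receipt number follows the later "receipt 4" updates):
#
#   python3 third_party/composable_kernel/example/ck_tile/01_fmha/generate.py \
#       --api fwd --receipt 4 --list_blobs fwd_blob_list.txt
#   python3 third_party/composable_kernel/example/ck_tile/01_fmha/generate.py \
#       --api fwd --receipt 4 --output_dir .
#   ./add_make_kernel_pt.sh fwd_blob_list.txt
#   for file in *.cpp; do mv -- "$file" "${file%.cpp}.hip"; done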
+ From c884aa67d0ae05480045d1d088f4ca4acb47778d Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Mon, 10 Feb 2025 18:13:16 +0000 Subject: [PATCH 06/46] Create ck flash attention CMakeLists.txt file --- .../hip/flash_attn/ck/CMakeLists.txt | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/CMakeLists.txt diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/CMakeLists.txt b/aten/src/ATen/native/transformers/hip/flash_attn/ck/CMakeLists.txt new file mode 100644 index 000000000000..2f2b39086ca6 --- /dev/null +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/CMakeLists.txt @@ -0,0 +1,63 @@ +# generate a list of kernels, but not actually emit files at config stage +execute_process( + COMMAND python3 ${CMAKE_CURRENT_LIST_DIR}/../../../../../../../../third_party/composable_kernel/example/ck_tile/01_fmha/generate.py + --api fwd --receipt 2 --list_blobs ${CMAKE_CURRENT_LIST_DIR}/fwd_blob_list.txt + RESULT_VARIABLE ret +) + +if(ret AND NOT ret EQUAL 0) + message( FATAL_ERROR "CK Tile FMHA FAILED to generate a list of FWD kernels via Python.") +endif() + +execute_process( + COMMAND python3 ${CMAKE_CURRENT_LIST_DIR}/../../../../../../../../third_party/composable_kernel/example/ck_tile/01_fmha/generate.py + --api bwd --receipt 2 --list_blobs ${CMAKE_CURRENT_LIST_DIR}/bwd_blob_list.txt + RESULT_VARIABLE ret +) + +if(ret AND NOT ret EQUAL 0) + message( FATAL_ERROR "CK Tile FMHA FAILED to generate a list of BWD kernels via Python.") +endif() + +# Generate the files for both fwd and bwd +execute_process(COMMAND python3 ${CMAKE_CURRENT_LIST_DIR}/../../../../../../../../third_party/composable_kernel/example/ck_tile/01_fmha/generate.py --api fwd --receipt 2 --output_dir ${CMAKE_CURRENT_LIST_DIR} +) + +if(ret AND NOT ret EQUAL 0) + message( FATAL_ERROR "CK Tile FMHA FAILED to generate FWD kernels.") +endif() + +execute_process(COMMAND python3 ${CMAKE_CURRENT_LIST_DIR}/../../../../../../../../third_party/composable_kernel/example/ck_tile/01_fmha/generate.py --api bwd --receipt 2 --output_dir ${CMAKE_CURRENT_LIST_DIR} + RESULT_VARIABLE ret +) + +if(ret AND NOT ret EQUAL 0) + message( FATAL_ERROR "CK Tile FMHA FAILED to generate BWD kernels.") +endif() + +# Change make_kernel to make_kernel_pt for fwd +execute_process( + COMMAND bash -c "${CMAKE_CURRENT_LIST_DIR}/add_make_kernel_pt.sh ${CMAKE_CURRENT_LIST_DIR}/fwd_blob_list.txt" + RESULT_VARIABLE ret) + +if(ret AND NOT ret EQUAL 0) + message( FATAL_ERROR "CK Tile FMHA FAILED to change make_kernel to make_kernel_pt for the fwd pass") +endif() + +# Change make_kernel to make_kernel_pt for bwd +execute_process( + COMMAND bash -c "${CMAKE_CURRENT_LIST_DIR}/add_make_kernel_pt.sh ${CMAKE_CURRENT_LIST_DIR}/bwd_blob_list.txt" + RESULT_VARIABLE ret) + +if(ret AND NOT ret EQUAL 0) + message( FATAL_ERROR "CK Tile FMHA FAILED to change make_kernel to make_kernel_pt for the bwd pass") +endif() + +# Change file extensions to .hip +execute_process(COMMAND bash -c "for file in ${CMAKE_CURRENT_LIST_DIR}/*.cpp; do mv -- \"$file\" \"\${file%.cpp}.hip\"; done" + RESULT_VARIABLE ret +) + +if(ret AND NOT ret EQUAL 0) + message( FATAL_ERROR "CK Tile FMHA FAILED to change the generated instances extensions from .cpp to .hpp") +endif() From e3ab46d7f33d1854998506746904efeb3776e8ca Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Tue, 11 Feb 2025 19:32:51 +0000 Subject: [PATCH 07/46] Update CK to pick up receipt 4 --- third_party/composable_kernel | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/third_party/composable_kernel b/third_party/composable_kernel index c0b90f130f4c..8086bbe3a78d 160000 --- a/third_party/composable_kernel +++ b/third_party/composable_kernel @@ -1 +1 @@ -Subproject commit c0b90f130f4cad7f1e7fc97c4d58d4798ecc2d47 +Subproject commit 8086bbe3a78d931eb96fe12fdc014082e18d18d3 From 22940d31b687e2756d4ff301514805eef2176447 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Tue, 11 Feb 2025 18:02:53 +0000 Subject: [PATCH 08/46] Update to receipt 4 --- .../native/transformers/hip/flash_attn/ck/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/CMakeLists.txt b/aten/src/ATen/native/transformers/hip/flash_attn/ck/CMakeLists.txt index 2f2b39086ca6..a72911cd510e 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/CMakeLists.txt +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/CMakeLists.txt @@ -1,7 +1,7 @@ # generate a list of kernels, but not actually emit files at config stage execute_process( COMMAND python3 ${CMAKE_CURRENT_LIST_DIR}/../../../../../../../../third_party/composable_kernel/example/ck_tile/01_fmha/generate.py - --api fwd --receipt 2 --list_blobs ${CMAKE_CURRENT_LIST_DIR}/fwd_blob_list.txt + --api fwd --receipt 4 --list_blobs ${CMAKE_CURRENT_LIST_DIR}/fwd_blob_list.txt RESULT_VARIABLE ret ) @@ -11,7 +11,7 @@ endif() execute_process( COMMAND python3 ${CMAKE_CURRENT_LIST_DIR}/../../../../../../../../third_party/composable_kernel/example/ck_tile/01_fmha/generate.py - --api bwd --receipt 2 --list_blobs ${CMAKE_CURRENT_LIST_DIR}/bwd_blob_list.txt + --api bwd --receipt 4 --list_blobs ${CMAKE_CURRENT_LIST_DIR}/bwd_blob_list.txt RESULT_VARIABLE ret ) @@ -20,14 +20,14 @@ if(ret AND NOT ret EQUAL 0) endif() # Generate the files for both fwd and bwd -execute_process(COMMAND python3 ${CMAKE_CURRENT_LIST_DIR}/../../../../../../../../third_party/composable_kernel/example/ck_tile/01_fmha/generate.py --api fwd --receipt 2 --output_dir ${CMAKE_CURRENT_LIST_DIR} +execute_process(COMMAND python3 ${CMAKE_CURRENT_LIST_DIR}/../../../../../../../../third_party/composable_kernel/example/ck_tile/01_fmha/generate.py --api fwd --receipt 4 --output_dir ${CMAKE_CURRENT_LIST_DIR} ) if(ret AND NOT ret EQUAL 0) message( FATAL_ERROR "CK Tile FMHA FAILED to generate FWD kernels.") endif() -execute_process(COMMAND python3 ${CMAKE_CURRENT_LIST_DIR}/../../../../../../../../third_party/composable_kernel/example/ck_tile/01_fmha/generate.py --api bwd --receipt 2 --output_dir ${CMAKE_CURRENT_LIST_DIR} +execute_process(COMMAND python3 ${CMAKE_CURRENT_LIST_DIR}/../../../../../../../../third_party/composable_kernel/example/ck_tile/01_fmha/generate.py --api bwd --receipt 4 --output_dir ${CMAKE_CURRENT_LIST_DIR} RESULT_VARIABLE ret ) From 1b3ac115d7f4d38bdfa4f19436471befd558d0ce Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Tue, 11 Feb 2025 19:25:15 +0000 Subject: [PATCH 09/46] Add warning if CK is requested when unsupported --- aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h index ca13a771bb19..10bfe248a76f 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h @@ -488,6 +488,10 @@ inline std::tuple mha_bwd( philox_offset); } #else + if(at::globalContext().getROCmFAPreferredBackend() == + 
at::ROCmFABackend::Ck) { + TORCH_WARN_ONCE("Warning! You have opted to use CK flash attention backend in a build that was not compiled using USE_CK_FLASH_ATTENTION=1. Please set this variable and try again. Defaulting to use aotriton backend..."); + } return mha_bwd_aot( dout, q, From da17d67e828d48914c78aa748ed345e5fe9aab40 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Tue, 11 Feb 2025 22:19:23 +0000 Subject: [PATCH 10/46] Add generated files to .gitignore --- .gitignore | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index 8d4ceaa811c0..54129af906fb 100644 --- a/.gitignore +++ b/.gitignore @@ -124,6 +124,13 @@ torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h torch/version.py minifier_launcher.py +aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_d* +aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_d* +aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_convert* +aten/src/ATen/native/transformers/hip/flash_attn/ck/fwd_blob* +aten/src/ATen/native/transformers/hip/flash_attn/ck/bwd_blob* +aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_api* +aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_api* # Root level file used in CI to specify certain env configs. # E.g., see .circleci/config.yaml env From 4cc20dac7cec4d686f08c1709268e7bdaa561fcd Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Wed, 12 Feb 2025 01:32:59 +0000 Subject: [PATCH 11/46] lint --- .../native/transformers/hip/flash_attn/ck/add_make_kernel_pt.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/add_make_kernel_pt.sh b/aten/src/ATen/native/transformers/hip/flash_attn/ck/add_make_kernel_pt.sh index 80515021ba12..672bea143751 100755 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/add_make_kernel_pt.sh +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/add_make_kernel_pt.sh @@ -28,4 +28,3 @@ while IFS= read -r file; do done < "$file_list" echo "Replacement completed." 
- From 18982007977f7e9852ccecfdaf24afe2a3ff07bc Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Tue, 28 Jan 2025 22:23:57 +0000 Subject: [PATCH 12/46] Initial plumbing for mem_eff path --- .../native/transformers/cuda/attention.cu | 171 +++++++++--------- .../hip/flash_attn/aot/mha_all_aot.hip | 2 +- 2 files changed, 89 insertions(+), 84 deletions(-) diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index 7fe7ee7a1ba1..a30fa9408269 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -1128,97 +1128,102 @@ std::tuple _efficient_ #ifdef USE_ROCM // ROCM Implementation - auto ret = aotriton::v2::flash::check_gpu(stream); - if (hipSuccess != ret) { - TORCH_CHECK(false, + if(at::globalContext().getROCmFAPreferredBackend() == + at::ROCmFABackend::Ck) { + //forward_attention_ck(...); + } else { // use aotriton + auto ret = aotriton::v2::flash::check_gpu(stream); + if (hipSuccess != ret) { + TORCH_CHECK(false, "[AOTriton] Accelerated SDPA only supports MI200/MI300X/Navi31 GPUs" " (gfx90a:sramecc+:xnack-/gfx942:sramecc+:xnack-/gfx1100)") - } + } - // AOTriton may accept aligned on logsumexp tensor in the future for better - // performance, but for now it requires compact logsumexp tensor, even if - // compute_logsumexp is false - constexpr int kAlignLSE = 1; - res = at::empty({B, M, num_heads, Kv}, query.options()); - logsumexp = at::empty( + // AOTriton may accept aligned on logsumexp tensor in the future for better + // performance, but for now it requires compact logsumexp tensor, even if + // compute_logsumexp is false + constexpr int kAlignLSE = 1; + res = at::empty({B, M, num_heads, Kv}, query.options()); + logsumexp = at::empty( { B, num_heads, max_seqlen_q }, query.options().dtype(at::ScalarType::Float)); - at::Tensor softmax_lse = logsumexp.view({B * num_heads, max_seqlen_q}); - at::Tensor q_t = query.transpose(1, 2); - at::Tensor k_t = key.transpose(1, 2); - at::Tensor v_t = value.transpose(1, 2); - at::Tensor output_t = res.transpose(1, 2); - bool is_causal; - if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { - is_causal = true; - } else if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { - is_causal = false; - } else { - TORCH_CHECK(false, "[_efficient_attention_forward] Unsupported mask type on ROCM, for now"); - } - - const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float(); + at::Tensor softmax_lse = logsumexp.view({B * num_heads, max_seqlen_q}); + at::Tensor q_t = query.transpose(1, 2); + at::Tensor k_t = key.transpose(1, 2); + at::Tensor v_t = value.transpose(1, 2); + at::Tensor output_t = res.transpose(1, 2); + bool is_causal; + if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { + is_causal = true; + } else if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { + is_causal = false; + } else { + TORCH_CHECK(false, "[_efficient_attention_forward] Unsupported mask type on ROCM, for now"); + } - using aotriton::v2::flash::attn_fwd; - using aotriton::v2::flash::attn_fwd_compact_varlen; - using sdp::aotriton_adapter::mk_aotensor; - using sdp::aotriton_adapter::mk_aoscalartensor; - using sdp::aotriton_adapter::mk_philoxtensor; - aotriton::TensorView<4> empty_t4(0, {0, 0, 0, 0}, {0, 0, 0, 0}, aotriton::DType::kFloat16); - at::Tensor softmax_fa_t = at::empty({ 0, 0, 0, 0 }, query.options()); - const bool use_philox_state = 
in_capture_stream; - auto seed = use_philox_state ? mk_philoxtensor(philox_state.seed_.ptr) : mk_aoscalartensor(seed_t); - auto offset1 = use_philox_state ? mk_philoxtensor(philox_state.offset_.ptr) : mk_aoscalartensor(offset_t); - auto offset2 = use_philox_state ? philox_state.offset_intragraph_ : 0; - auto seed_output = use_philox_state ? mk_philoxtensor(seed_t.data_ptr()) : mk_philoxtensor(nullptr); - auto offset_output = use_philox_state ? mk_philoxtensor(offset_t.data_ptr()) : mk_philoxtensor(nullptr); - hipError_t err; // TODO: Error handling - if (seqstart_q.has_value()) { - // varlen aka nested tensor - err = attn_fwd_compact_varlen(mk_aotensor(q_t, "q"), - mk_aotensor(k_t, "k"), - mk_aotensor(v_t, "v"), - mk_aotensor<1>(seqstart_q.value(), "cu_seqlens_q"), - mk_aotensor<1>(seqstart_k.value(), "cu_seqlens_k"), - max_seqlen_q, - max_seqlen_k, - bias.has_value() ? mk_aotensor(bias.value(), "bias"): empty_t4, - softmax_scale, - mk_aotensor<2>(softmax_lse, "M"), - mk_aotensor(output_t, "Out"), - dropout_p, - seed, - offset1, - offset2, - seed_output, - offset_output, - mk_aotensor(softmax_fa_t, "encoded_softmax"), - is_causal, - stream); - } else { - err = attn_fwd(mk_aotensor(q_t, "q"), - mk_aotensor(k_t, "k"), - mk_aotensor(v_t, "v"), - bias.has_value() ? mk_aotensor(bias.value(), "bias"): empty_t4, - softmax_scale, - mk_aotensor<2>(softmax_lse, "M"), - mk_aotensor(output_t, "Out"), - dropout_p, - seed, - offset1, - offset2, - seed_output, - offset_output, - mk_aotensor(softmax_fa_t, "encoded_softmax"), - is_causal, - stream); - } - if (!compute_logsumexp) { - // Set the tensor to empty when compute_logsumexp is false - logsumexp = at::empty( + const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float(); + + using aotriton::v2::flash::attn_fwd; + using aotriton::v2::flash::attn_fwd_compact_varlen; + using sdp::aotriton_adapter::mk_aotensor; + using sdp::aotriton_adapter::mk_aoscalartensor; + using sdp::aotriton_adapter::mk_philoxtensor; + aotriton::TensorView<4> empty_t4(0, {0, 0, 0, 0}, {0, 0, 0, 0}, aotriton::DType::kFloat16); + at::Tensor softmax_fa_t = at::empty({ 0, 0, 0, 0 }, query.options()); + const bool use_philox_state = in_capture_stream; + auto seed = use_philox_state ? mk_philoxtensor(philox_state.seed_.ptr) : mk_aoscalartensor(seed_t); + auto offset1 = use_philox_state ? mk_philoxtensor(philox_state.offset_.ptr) : mk_aoscalartensor(offset_t); + auto offset2 = use_philox_state ? philox_state.offset_intragraph_ : 0; + auto seed_output = use_philox_state ? mk_philoxtensor(seed_t.data_ptr()) : mk_philoxtensor(nullptr); + auto offset_output = use_philox_state ? mk_philoxtensor(offset_t.data_ptr()) : mk_philoxtensor(nullptr); + hipError_t err; // TODO: Error handling + if (seqstart_q.has_value()) { + // varlen aka nested tensor + err = attn_fwd_compact_varlen(mk_aotensor(q_t, "q"), + mk_aotensor(k_t, "k"), + mk_aotensor(v_t, "v"), + mk_aotensor<1>(seqstart_q.value(), "cu_seqlens_q"), + mk_aotensor<1>(seqstart_k.value(), "cu_seqlens_k"), + max_seqlen_q, + max_seqlen_k, + bias.has_value() ? mk_aotensor(bias.value(), "bias"): empty_t4, + softmax_scale, + mk_aotensor<2>(softmax_lse, "M"), + mk_aotensor(output_t, "Out"), + dropout_p, + seed, + offset1, + offset2, + seed_output, + offset_output, + mk_aotensor(softmax_fa_t, "encoded_softmax"), + is_causal, + stream); + } else { + err = attn_fwd(mk_aotensor(q_t, "q"), + mk_aotensor(k_t, "k"), + mk_aotensor(v_t, "v"), + bias.has_value() ? 
mk_aotensor(bias.value(), "bias"): empty_t4, + softmax_scale, + mk_aotensor<2>(softmax_lse, "M"), + mk_aotensor(output_t, "Out"), + dropout_p, + seed, + offset1, + offset2, + seed_output, + offset_output, + mk_aotensor(softmax_fa_t, "encoded_softmax"), + is_causal, + stream); + } + if (!compute_logsumexp) { + // Set the tensor to empty when compute_logsumexp is false + logsumexp = at::empty( { B * num_heads, max_seqlen_q, 0 }, query.options().dtype(at::ScalarType::Float)); - } + } + } // CK BACKEND #else // CUDA Implementation cudaDeviceProp* p = at::cuda::getDeviceProperties(query.device().index()); diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip b/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip index 65425f9c960d..12ed3e6e1f36 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip @@ -742,4 +742,4 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size } } // namespace pytorch_flash -#endif +#endif // USE_FLASH_ATTENTION From 78f9f55a4fda0de713f11f29b1d30983560ff38f Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Wed, 29 Jan 2025 22:26:41 +0000 Subject: [PATCH 13/46] Began writing function signature --- .../native/transformers/cuda/attention.cu | 1 + .../transformers/hip/flash_attn/flash_api.h | 28 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index a30fa9408269..ee6a66f7c074 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -1131,6 +1131,7 @@ std::tuple _efficient_ if(at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { //forward_attention_ck(...); + std::cout << "In my branch" << std::endl; } else { // use aotriton auto ret = aotriton::v2::flash::check_gpu(stream); if (hipSuccess != ret) { diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h index 10bfe248a76f..670666d93f40 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h @@ -625,4 +625,32 @@ inline std::tuple mha_varlen_bwd #endif } +std::tuple< + at::Tensor, // output + at::Tensor, // q + at::Tensor, // k + at::Tensor, // v + at::Tensor, // lse + at::Tensor, // seed + at::Tensor, // offset + at::Tensor> // dropout randval +mem_eff_forward_ck( + const at::Tensor& q, + const at::Tensor& k, + const at::Tensor& v, + const float p_dropout, + const float softmax_scale, + const bool return_dropout_randval, + const std::optional is_causal, + const std::optional scale, + const std::optional& attn_bias, + const std::optional& out_, + const std::optional& seqstart_q, + const std::optional& seqstart_k, + std::optional gen_ +); + + + + } // namespace pytorch_flash From 03d5fda4347ba63ecca38f2cb48c2987d3dd6380 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Thu, 30 Jan 2025 22:46:47 +0000 Subject: [PATCH 14/46] parameters aligned (pre-compile) --- .../native/transformers/cuda/attention.cu | 16 ++++ .../hip/flash_attn/ck/me_fwd_ck.hip | 73 +++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu 
b/aten/src/ATen/native/transformers/cuda/attention.cu index ee6a66f7c074..875a2487225b 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -848,6 +848,12 @@ std::tuple _scaled_dot_product_efficient_attenti std::optional scale) { // Used for tracking usage statistics C10_LOG_API_USAGE_ONCE("torch.sdpa.mem_efficient_attention"); + std::cout << std::endl; + std::cout << "what we want vvvvvvvvvvvvvv" << std::endl; + std::cout << "MAX_SEQLEN_Q: " << b4_max_seqlen_batch_q << std::endl; + std::cout << "MAX_SEQLEN_K: " << b4_max_seqlen_batch_k << std::endl; + std::cout << "MAX_SEQLEN_V: " << b4_max_seqlen_batch_v << std::endl; + std::cout << "^^^^^^^^^^^^^^^^^^^^^^^^^^^^" << std::endl; // Query -> Query(Batch x Q_seq_len x Num_heads x Dim_per_head) // Key -> Key(Batch x KV_seq_len x Num_heads x Dim_per_head) // Value -> Value(Batch x KV_seq_len x Num_heads x Dim_per_head) @@ -1026,6 +1032,16 @@ std::tuple _efficient_ // TODO In theory it is possible to compile with _CUDA_ARCH < 5.0 and run on a // machine that is >= 5.0. In practice, this is not a problem but since // this would avoid runtime architecture checks, we should look into it + const int64_t new_max_seqlen_batch_q = query.size(1); + const int64_t new_max_seqlen_batch_k = key.size(1); + const int64_t new_max_seqlen_batch_v = value.size(1); + std::cout << std::endl; + std::cout << "MEMORY_EFFICIENT VVVVVVVVVV" << std::endl; + std::cout << "MAX_SEQLEN_Q: " << new_max_seqlen_batch_q << std::endl; + std::cout << "MAX_SEQLEN_K: " << new_max_seqlen_batch_k << std::endl; + std::cout << "MAX_SEQLEN_V: " << new_max_seqlen_batch_v << std::endl; + std::cout << "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" << std::endl; + TORCH_CHECK(query.dim() == 4); TORCH_CHECK(query.dim() == 4); TORCH_CHECK(key.dim() == 4); diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip new file mode 100644 index 000000000000..85187a85a5e0 --- /dev/null +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip @@ -0,0 +1,73 @@ +#include + + +namespace pytorch_flash { + + +std::tuple< + at::Tensor, // output + at::Tensor, // q + at::Tensor, // k + at::Tensor, // v + at::Tensor, // lse + at::Tensor, // seed + at::Tensor, // offset + at::Tensor> // dropout randval +mem_eff_forward_ck( + const at::Tensor& q, + const at::Tensor& k, + const at::Tensor& v, + const float p_dropout, + const float softmax_scale, + const bool return_dropout_randval, + const std::optional is_causal, + const std::optional scale, + const std::optional& attn_bias, + const std::optional& out_, + const std::optional& seqstart_q, + const std::optional& seqstart_k, + std::optional gen_) { + + // need to pass attn_bias to both of these + if(!seqstart_q.has_value()){ + return mha_fwd_ck( + q, // q + k, // k + v, // v + out_, // opt(out_) + std::nullopt, // opt(alibi_slopes) + p_dropout, // p_dropout + scale, // opt(softmax_scale) + is_causal, // opt(is_causal) + std::nullopt, // window_size_left + std::nullopt, // window_size_right + false, // return_softmax/return_debug_mask + gen_); // gen + } else { + // max sequence lengths are now at T.size(1) since q,k,v were all transposed + // in _scaled_dot_product_efficient_attention_cuda + const int64_t max_seqlen_q = q.size(1); + const int64_t max_seqlen_k = k.size(1); + return mha_varlen_fwd_ck( + q, // q + k, // k + v, // v + out_, // opt(out) + std::nullopt, // cu_seqlens_q + std::nullopt, // 
cu_seqlens_k + std::nullopt, // opt(seqused_k) + std::nullopt, // opt(alibi_slopes) + max_seqlen_q, // max_seqlen_q + max_seqlen_k, // max_seqlen_k + p_dropout // p_dropout + scale, // softmax_scale + false, // zero_tensors + is_causal, // is_causal + std::nullopt, // window_size_left + std::nullopt, // window_size_right + false, // return_softmax/return_debug_mask + gen_);// gen + } +} + +} // namespace pytorch_flash From 5fd4a309a8d2b2d05191f38a674378fe9f61fd78 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Mon, 3 Feb 2025 21:25:06 +0000 Subject: [PATCH 15/46] parameters aligned (post-compile) --- .../native/transformers/cuda/attention.cu | 12 ++--- .../hip/flash_attn/ck/me_fwd_ck.hip | 49 ++++++++++++------- 2 files changed, 38 insertions(+), 23 deletions(-) diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index 875a2487225b..a7f9648a7d13 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -848,12 +848,12 @@ std::tuple _scaled_dot_product_efficient_attenti std::optional scale) { // Used for tracking usage statistics C10_LOG_API_USAGE_ONCE("torch.sdpa.mem_efficient_attention"); - std::cout << std::endl; - std::cout << "what we want vvvvvvvvvvvvvv" << std::endl; - std::cout << "MAX_SEQLEN_Q: " << b4_max_seqlen_batch_q << std::endl; - std::cout << "MAX_SEQLEN_K: " << b4_max_seqlen_batch_k << std::endl; - std::cout << "MAX_SEQLEN_V: " << b4_max_seqlen_batch_v << std::endl; - std::cout << "^^^^^^^^^^^^^^^^^^^^^^^^^^^^" << std::endl; + //std::cout << std::endl; + //std::cout << "what we want vvvvvvvvvvvvvv" << std::endl; + //std::cout << "MAX_SEQLEN_Q: " << b4_max_seqlen_batch_q << std::endl; + //std::cout << "MAX_SEQLEN_K: " << b4_max_seqlen_batch_k << std::endl; + //std::cout << "MAX_SEQLEN_V: " << b4_max_seqlen_batch_v << std::endl; + //std::cout << "^^^^^^^^^^^^^^^^^^^^^^^^^^^^" << std::endl; // Query -> Query(Batch x Q_seq_len x Num_heads x Dim_per_head) // Key -> Key(Batch x KV_seq_len x Num_heads x Dim_per_head) // Value -> Value(Batch x KV_seq_len x Num_heads x Dim_per_head) diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip index 85187a85a5e0..e9d64a3c017e 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip @@ -21,26 +21,40 @@ mem_eff_forward_ck( const float softmax_scale, const bool return_dropout_randval, const std::optional is_causal, - const std::optional scale, + const std::optional scale, const std::optional& attn_bias, - const std::optional& out_, + std::optional& out_, + const std::optional& cu_seqlens_q, + const std::optional& cu_seqlens_k, const std::optional& seqstart_q, const std::optional& seqstart_k, + std::optional& seqused_k_, + std::optional & alibi_slopes_, std::optional gen_) { + // These normally get passed in as std::nullopt so just set to -1 + // Note: See attention.cu ~line 928 and line 729 + const int non_null_window_left = -1; + const int non_null_window_right = -1; + + TORCH_CHECK( + cu_seqlens_q.has_value() == cu_seqlens_k.has_value(), + "cu_seqlens_q and cu_seqlens_k must be both set or both not set"); + + // need to pass attn_bias to both of these - if(!seqstart_q.has_value()){ + if(!cu_seqlens_q.has_value()){ return mha_fwd_ck( q, // q k, // k v, // v out_, // opt(out_) - std::nullopt, // opt(alibi_slopes) + 
alibi_slopes_, // opt(alibi_slopes) p_dropout, // p_dropout - scale, // opt(softmax_scale) - is_causal, // opt(is_causal) - std::nullopt, // window_size_left - std::nullopt, // window_size_right + scale.value(), // opt(softmax_scale) + is_causal.value(), // opt(is_causal) + non_null_window_left, // window_size_left + non_null_window_right, // window_size_right false, // return_softmax/return_debug_mask gen_); // gen } else { @@ -48,23 +62,24 @@ mem_eff_forward_ck( // in _scaled_dot_product_efficient_attention_cuda const int64_t max_seqlen_q = q.size(1); const int64_t max_seqlen_k = k.size(1); + return mha_varlen_fwd_ck( q, // q k, // k v, // v out_, // opt(out) - std::nullopt, // cu_seqlens_q - std::nullopt, // cu_seqlens_k - std::nullopt, // opt(seqused_k) - std::nullopt, // opt(alibi_slopes) + cu_seqlens_q.value(), // cu_seqlens_q + cu_seqlens_k.value(), // cu_seqlens_k + seqused_k_, // opt(seqused_k) + alibi_slopes_, // opt(alibi_slopes) max_seqlen_q, // max_seqlen_q max_seqlen_k, // max_seqlen_k - p_dropout // p_dropout - scale, // softmax_scale + p_dropout, // p_dropout + scale.value(),// softmax_scale false, // zero_tensors - is_causal, // is_causal - std::nullopt, // window_size_left - std::nullopt, // window_size_right + is_causal.value(), // is_causal + non_null_window_left, // window_size_left + non_null_window_right, // window_size_right false, // return_softmax/return_debug_mask gen_);// gen } From c9752e936d7239ad5d3c6be9cfe70f7861c0cb95 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Tue, 4 Feb 2025 21:11:21 +0000 Subject: [PATCH 16/46] Called my new func (pre-compile) --- .../native/transformers/cuda/attention.cu | 30 ++++++++++++++++++- .../hip/flash_attn/ck/me_fwd_ck.hip | 6 ++-- .../transformers/hip/flash_attn/flash_api.h | 5 ++-- 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index a7f9648a7d13..04107cd715bb 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -1147,7 +1147,35 @@ std::tuple _efficient_ if(at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { //forward_attention_ck(...); - std::cout << "In my branch" << std::endl; + std::cout << "In my branch" << std::endl; + auto + [out_, + q, + k, + v, + softmax_lse, + seed_t, + offset_t, + p] = + pytorch_flash::mem_eff_forward_ck( + query, + key, + value, + dropout_p, + false, // return dropout_randval + scale, + custom_mask_type == 0 ? 
false : true, // is_causal + bias, + res, + std::nullopt, // cu_seqlens_q: sending in nothing since CKFA works this way + std::nullopt, // cu_seqlens_k + seqstart_q, + seqstart_k, + gen_); + // not passing in optional seqused_k_ + // not passing in optional alibi_slopes_ + + } else { // use aotriton auto ret = aotriton::v2::flash::check_gpu(stream); if (hipSuccess != ret) { diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip index e9d64a3c017e..c36e2210816e 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip @@ -18,7 +18,6 @@ mem_eff_forward_ck( const at::Tensor& k, const at::Tensor& v, const float p_dropout, - const float softmax_scale, const bool return_dropout_randval, const std::optional is_causal, const std::optional scale, @@ -28,9 +27,10 @@ mem_eff_forward_ck( const std::optional& cu_seqlens_k, const std::optional& seqstart_q, const std::optional& seqstart_k, + std::optional gen_, std::optional& seqused_k_, - std::optional & alibi_slopes_, - std::optional gen_) { + std::optional & alibi_slopes_) { + // These normally get passed in as std::nullopt so just set to -1 // Note: See attention.cu ~line 928 and line 729 diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h index 670666d93f40..565a2b2ff8cf 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h @@ -639,12 +639,13 @@ mem_eff_forward_ck( const at::Tensor& k, const at::Tensor& v, const float p_dropout, - const float softmax_scale, const bool return_dropout_randval, const std::optional is_causal, const std::optional scale, const std::optional& attn_bias, - const std::optional& out_, + std::optional& out_, + const std::optional& cu_seqlens_q, + const std::optional& cu_seqlens_k, const std::optional& seqstart_q, const std::optional& seqstart_k, std::optional gen_ From 8b3256f372aa892787fb09a118e3757dc6d60996 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Tue, 4 Feb 2025 21:52:02 +0000 Subject: [PATCH 17/46] call my func (post-compile) --- aten/src/ATen/native/transformers/cuda/attention.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index 04107cd715bb..989442696935 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -1148,6 +1148,7 @@ std::tuple _efficient_ at::ROCmFABackend::Ck) { //forward_attention_ck(...); std::cout << "In my branch" << std::endl; + std::optional out = std::nullopt; auto [out_, q, @@ -1163,15 +1164,15 @@ std::tuple _efficient_ value, dropout_p, false, // return dropout_randval - scale, custom_mask_type == 0 ? 
false : true, // is_causal + scale, bias, - res, + out, std::nullopt, // cu_seqlens_q: sending in nothing since CKFA works this way std::nullopt, // cu_seqlens_k seqstart_q, seqstart_k, - gen_); + std::nullopt);// not passing in optional gen_ // not passing in optional seqused_k_ // not passing in optional alibi_slopes_ From af6d8792fccb41967227d4550db3ee5e09a6af09 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Wed, 5 Feb 2025 16:42:24 +0000 Subject: [PATCH 18/46] feed attn_bias to mha_fwd* (pre-compile) --- .../native/transformers/cuda/attention.cu | 2 +- .../hip/flash_attn/ck/me_fwd_ck.hip | 60 ++++++++++--------- .../hip/flash_attn/ck/mha_fwd_ck.hip | 3 +- .../hip/flash_attn/ck/mha_varlen_fwd_ck.hip | 3 +- .../transformers/hip/flash_attn/flash_api.h | 8 ++- 5 files changed, 41 insertions(+), 35 deletions(-) diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index 989442696935..6f0bd861de1f 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -1148,7 +1148,7 @@ std::tuple _efficient_ at::ROCmFABackend::Ck) { //forward_attention_ck(...); std::cout << "In my branch" << std::endl; - std::optional out = std::nullopt; + std::optional out = std::nullopt; auto [out_, q, diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip index c36e2210816e..b89d8d94204a 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip @@ -21,7 +21,7 @@ mem_eff_forward_ck( const bool return_dropout_randval, const std::optional is_causal, const std::optional scale, - const std::optional& attn_bias, + const std::optional& attn_bias_, std::optional& out_, const std::optional& cu_seqlens_q, const std::optional& cu_seqlens_k, @@ -45,18 +45,19 @@ mem_eff_forward_ck( // need to pass attn_bias to both of these if(!cu_seqlens_q.has_value()){ return mha_fwd_ck( - q, // q - k, // k - v, // v - out_, // opt(out_) - alibi_slopes_, // opt(alibi_slopes) - p_dropout, // p_dropout - scale.value(), // opt(softmax_scale) - is_causal.value(), // opt(is_causal) - non_null_window_left, // window_size_left + q, // q + k, // k + v, // v + out_, // opt(out_) + alibi_slopes_, // opt(alibi_slopes) + p_dropout, // p_dropout + scale.value(), // opt(softmax_scale) + is_causal.value(), // opt(is_causal) + non_null_window_left, // window_size_left non_null_window_right, // window_size_right - false, // return_softmax/return_debug_mask - gen_); // gen + false, // return_softmax/return_debug_mask + gen_, // gen + attn_bias_); // attn_bias } else { // max sequence lengths are now at T.size(1) since q,k,v were all transposed // in _scaled_dot_product_efficient_attention_cuda @@ -64,24 +65,25 @@ mem_eff_forward_ck( const int64_t max_seqlen_k = k.size(1); return mha_varlen_fwd_ck( - q, // q - k, // k - v, // v - out_, // opt(out) - cu_seqlens_q.value(), // cu_seqlens_q - cu_seqlens_k.value(), // cu_seqlens_k - seqused_k_, // opt(seqused_k) - alibi_slopes_, // opt(alibi_slopes) - max_seqlen_q, // max_seqlen_q - max_seqlen_k, // max_seqlen_k - p_dropout, // p_dropout - scale.value(),// softmax_scale - false, // zero_tensors - is_causal.value(), // is_causal - non_null_window_left, // window_size_left + q, // q + k, // k + v, // v + out_, // opt(out) + cu_seqlens_q.value(), // cu_seqlens_q + cu_seqlens_k.value(), // cu_seqlens_k + 
seqused_k_, // opt(seqused_k) + alibi_slopes_, // opt(alibi_slopes) + max_seqlen_q, // max_seqlen_q + max_seqlen_k, // max_seqlen_k + p_dropout, // p_dropout + scale.value(), // softmax_scale + false, // zero_tensors + is_causal.value(), // is_causal + non_null_window_left, // window_size_left non_null_window_right, // window_size_right - false, // return_softmax/return_debug_mask - gen_);// gen + false, // return_softmax/return_debug_mask + gen_, // gen + attn_bias_); // attn_bias } } diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip index 4d7726cec8fd..ddf1f9a9d34f 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip @@ -155,7 +155,8 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x int window_size_left, int window_size_right, const bool return_dropout_randval, - std::optional gen_) + std::optional gen_, + std::optional& attn_bias_) { auto q_dtype = q.dtype(); TORCH_CHECK(q_dtype == at::kHalf || q_dtype == at::kBFloat16, diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_fwd_ck.hip index 2ef2ab24d9a6..c5bc8a2e1d35 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_fwd_ck.hip @@ -163,7 +163,8 @@ mha_varlen_fwd_ck(const at::Tensor &q, // total_q x num_heads int window_size_left, int window_size_right, const bool return_dropout_randval, - std::optional gen_) + std::optional gen_, + std::optional& attn_bias_) { auto q_dtype = q.dtype(); TORCH_CHECK(q_dtype == at::kHalf || q_dtype == at::kBFloat16, diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h index 565a2b2ff8cf..ff15593ce5a3 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h @@ -151,7 +151,8 @@ mha_fwd_ck( int window_size_left, int window_size_right, const bool return_softmax, - std::optional gen_); + std::optional gen_, + std::optional& attn_bias_); std::tuple< at::Tensor, @@ -186,7 +187,8 @@ mha_varlen_fwd_ck( int window_size_left, int window_size_right, const bool return_softmax, - std::optional gen_); + std::optional gen_, + std::optional& attn_bias_); std::tuple mha_bwd_ck( const at::Tensor& dout, // batch_size x seqlen_q x num_heads, x head_size_og @@ -642,7 +644,7 @@ mem_eff_forward_ck( const bool return_dropout_randval, const std::optional is_causal, const std::optional scale, - const std::optional& attn_bias, + const std::optional& attn_bias_, std::optional& out_, const std::optional& cu_seqlens_q, const std::optional& cu_seqlens_k, From 2599a68acc1b217d4dfd3faf41171e50944bcb2e Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Wed, 5 Feb 2025 18:32:15 +0000 Subject: [PATCH 19/46] feed attn_bias to mha_fwd* (post-compile) --- .../transformers/hip/flash_attn/ck/mha_fwd_ck.hip | 2 +- .../hip/flash_attn/ck/mha_varlen_fwd_ck.hip | 2 +- .../native/transformers/hip/flash_attn/flash_api.h | 12 ++++++++---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip index ddf1f9a9d34f..f7d2d304b8df 100644 --- 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip @@ -156,7 +156,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x int window_size_right, const bool return_dropout_randval, std::optional gen_, - std::optional& attn_bias_) + const std::optional& attn_bias_) { auto q_dtype = q.dtype(); TORCH_CHECK(q_dtype == at::kHalf || q_dtype == at::kBFloat16, diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_fwd_ck.hip index c5bc8a2e1d35..e6b95824f84f 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_fwd_ck.hip @@ -164,7 +164,7 @@ mha_varlen_fwd_ck(const at::Tensor &q, // total_q x num_heads int window_size_right, const bool return_dropout_randval, std::optional gen_, - std::optional& attn_bias_) + const std::optional& attn_bias_) { auto q_dtype = q.dtype(); TORCH_CHECK(q_dtype == at::kHalf || q_dtype == at::kBFloat16, diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h index ff15593ce5a3..f9063179cbb2 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h @@ -152,7 +152,7 @@ mha_fwd_ck( int window_size_right, const bool return_softmax, std::optional gen_, - std::optional& attn_bias_); + const std::optional& attn_bias_); std::tuple< at::Tensor, @@ -188,7 +188,7 @@ mha_varlen_fwd_ck( int window_size_right, const bool return_softmax, std::optional gen_, - std::optional& attn_bias_); + const std::optional& attn_bias_); std::tuple mha_bwd_ck( const at::Tensor& dout, // batch_size x seqlen_q x num_heads, x head_size_og @@ -274,6 +274,7 @@ mha_fwd( #if defined(USE_CK_FLASH_ATTENTION) if (at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { + std::optional dummy_attn_bias = std::nullopt; return mha_fwd_ck( q, k, @@ -286,7 +287,8 @@ mha_fwd( window_size_left, window_size_right, return_softmax, - gen_); + gen_, + dummy_attn_bias); // Not used in flash attention } else { return mha_fwd_aot( q, @@ -358,6 +360,7 @@ mha_varlen_fwd( #if defined(USE_CK_FLASH_ATTENTION) if (at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { + std::optional dummy_attn_bias = std::nullopt; return mha_varlen_fwd_ck( q, k, @@ -376,7 +379,8 @@ mha_varlen_fwd( window_size_left, window_size_right, return_softmax, - gen_); + gen_, + dummy_attn_bias); // Not used in flash attention } else { return mha_varlen_fwd_aot( q, From 26d71a86b2c86769d1b66fe12a80c616effb2e0a Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Wed, 5 Feb 2025 22:45:12 +0000 Subject: [PATCH 20/46] fighting linker runtime error --- aten/src/ATen/native/transformers/cuda/attention.cu | 12 +++++++++--- .../transformers/hip/flash_attn/ck/me_fwd_ck.hip | 10 ++++------ .../native/transformers/hip/flash_attn/flash_api.h | 10 ++++++---- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index 6f0bd861de1f..e8e0baec0a5b 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -1144,11 +1144,17 @@ std::tuple _efficient_ #ifdef USE_ROCM // ROCM Implementation + if( bias.has_value() 
) { + std::cout << std::endl; + std::cout << "Attn_bias sizes: " << bias.value().sizes() << std::endl; + } if(at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { //forward_attention_ck(...); std::cout << "In my branch" << std::endl; std::optional out = std::nullopt; + std::optional seqused_k = std::nullopt; + std::optional alibi_slopes = std::nullopt; auto [out_, q, @@ -1172,9 +1178,9 @@ std::tuple _efficient_ std::nullopt, // cu_seqlens_k seqstart_q, seqstart_k, - std::nullopt);// not passing in optional gen_ - // not passing in optional seqused_k_ - // not passing in optional alibi_slopes_ + std::nullopt,// not passing in optional gen_ + seqused_k,// not passing in optional seqused_k_ + alibi_slopes);// not passing in optional alibi_slopes_ } else { // use aotriton diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip index b89d8d94204a..f4ad8c9b95bb 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip @@ -2,8 +2,6 @@ namespace pytorch_flash { - - std::tuple< at::Tensor, // output at::Tensor, // q @@ -17,10 +15,10 @@ mem_eff_forward_ck( const at::Tensor& q, const at::Tensor& k, const at::Tensor& v, - const float p_dropout, - const bool return_dropout_randval, - const std::optional is_causal, - const std::optional scale, + float p_dropout, + bool return_dropout_randval, + std::optional is_causal, + std::optional scale, const std::optional& attn_bias_, std::optional& out_, const std::optional& cu_seqlens_q, diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h index f9063179cbb2..e618f55b46b3 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h @@ -644,17 +644,19 @@ mem_eff_forward_ck( const at::Tensor& q, const at::Tensor& k, const at::Tensor& v, - const float p_dropout, + float p_dropout, const bool return_dropout_randval, - const std::optional is_causal, - const std::optional scale, + std::optional is_causal, + std::optional scale, const std::optional& attn_bias_, std::optional& out_, const std::optional& cu_seqlens_q, const std::optional& cu_seqlens_k, const std::optional& seqstart_q, const std::optional& seqstart_k, - std::optional gen_ + std::optional gen_, + std::optional& seqused_k_, + std::optional& alibi_slopes_ ); From ab012d69e0bea7ae3538f35edde592043f1d9f02 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Thu, 6 Feb 2025 16:28:08 +0000 Subject: [PATCH 21/46] Move declaration to new header --- .../native/transformers/cuda/attention.cu | 4 ++ .../hip/flash_attn/ck/me_ck_api.h | 38 +++++++++++++++++++ .../hip/flash_attn/ck/me_fwd_ck.hip | 2 +- .../transformers/hip/flash_attn/flash_api.h | 4 +- 4 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index e8e0baec0a5b..f486de3109da 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -77,11 +77,15 @@ #include #include #include + + + #else // MemoryEfficient Attention Specific Imports for ROCM #include #include #include +#include #endif #endif diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h new file mode 100644 index 000000000000..1593ff9944dc --- /dev/null +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h @@ -0,0 +1,38 @@ +#pragma once +#include + +#include + + +namespace pytorch_flash { + +std::tuple< + at::Tensor, // output + at::Tensor, // q + at::Tensor, // k + at::Tensor, // v + at::Tensor, // lse + at::Tensor, // seed + at::Tensor, // offset + at::Tensor> // dropout randval +mem_eff_forward_ck( + const at::Tensor& q, + const at::Tensor& k, + const at::Tensor& v, + float p_dropout, + const bool return_dropout_randval, + std::optional is_causal, + std::optional scale, + const std::optional& attn_bias_, + std::optional& out_, + const std::optional& cu_seqlens_q, + const std::optional& cu_seqlens_k, + const std::optional& seqstart_q, + const std::optional& seqstart_k, + std::optional gen_, + std::optional& seqused_k_, + std::optional& alibi_slopes_ +); + + +} // namespace pytorch_flash diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip index f4ad8c9b95bb..f530ef5721fe 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip @@ -1,5 +1,5 @@ #include - +#include namespace pytorch_flash { std::tuple< diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h index e618f55b46b3..aee9d9e1f48b 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h @@ -630,7 +630,7 @@ inline std::tuple mha_varlen_bwd philox_offset); #endif } - +/* std::tuple< at::Tensor, // output at::Tensor, // q @@ -658,7 +658,7 @@ mem_eff_forward_ck( std::optional& seqused_k_, std::optional& alibi_slopes_ ); - +*/ From b09806ef8d3904a619e6620b90d4279312b6c0ca Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Thu, 6 Feb 2025 17:45:56 +0000 Subject: [PATCH 22/46] Compiled linker error and not hitting runtime error --- .../ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h | 4 ++-- .../ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h index 1593ff9944dc..0fac19071ff7 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h @@ -20,9 +20,9 @@ mem_eff_forward_ck( const at::Tensor& k, const at::Tensor& v, float p_dropout, - const bool return_dropout_randval, + bool return_dropout_randval, std::optional is_causal, - std::optional scale, + std::optional scale, const std::optional& attn_bias_, std::optional& out_, const std::optional& cu_seqlens_q, diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip index f530ef5721fe..e1f98177fb86 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip @@ -27,7 +27,7 @@ mem_eff_forward_ck( const std::optional& seqstart_k, std::optional gen_, std::optional& seqused_k_, - std::optional & alibi_slopes_) { + 
std::optional& alibi_slopes_) { // These normally get passed in as std::nullopt so just set to -1 From 6bee61e6a479c65d7cf2485a6b0e0df26b0b0d51 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Thu, 6 Feb 2025 20:16:35 +0000 Subject: [PATCH 23/46] descriptive changes to just test_transformers.py DELETE LATER --- test/test_transformers.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/test/test_transformers.py b/test/test_transformers.py index af711a6fb67e..f8dfbfff9b50 100644 --- a/test/test_transformers.py +++ b/test/test_transformers.py @@ -2601,8 +2601,14 @@ def test_attention(backend: SDPBackend, permute_order: list[list[int]]): def test_mem_efficient_attention_mask_variants(self, device, mask_dim: list[int]): dtype = torch.float16 make_tensor = partial(torch.rand, device=device, dtype=dtype, requires_grad=True) - batch, num_heads, head_dim = 8, 8, 64 - seq_len_q, seq_len_kv = 64, 15 + batch, num_heads, head_dim = 1, 4, 8 + seq_len_q, seq_len_kv = 16, 32 + print("") + print("batch : " , batch) + print("nheads : " , num_heads) + print("hdim : " , head_dim) + print("seqlen_q : " , seq_len_q) + print("seqlen_kv : " , seq_len_kv) query = make_tensor(SdpaShape(batch, num_heads, seq_len_q, head_dim)) kv_shape = SdpaShape(batch, num_heads, seq_len_kv, head_dim) key, value = make_tensor(kv_shape), make_tensor(kv_shape) From 6700952a70e81b1ff1c5f8e10d553014d6232f2a Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Thu, 6 Feb 2025 20:34:27 +0000 Subject: [PATCH 24/46] some logging --- .../ATen/native/transformers/cuda/attention.cu | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index f486de3109da..3c5363fabd06 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -861,10 +861,15 @@ std::tuple _scaled_dot_product_efficient_attenti // Query -> Query(Batch x Q_seq_len x Num_heads x Dim_per_head) // Key -> Key(Batch x KV_seq_len x Num_heads x Dim_per_head) // Value -> Value(Batch x KV_seq_len x Num_heads x Dim_per_head) + std::cout << "sdpa_ef" << std::endl; + std::cout << "q.sizes : " << query.sizes() << std::endl; Tensor q_t = query.transpose(1, 2); Tensor k_t = key.transpose(1, 2); Tensor v_t = value.transpose(1, 2); + std::cout << "q_t.sizes: " << q_t.sizes() << std::endl; + + std::cout << "qagain.sizes: " << query.sizes() << std::endl; sdp::CustomMaskType custom_mask_type = is_causal ? sdp::CustomMaskType::CausalFromTopLeft : sdp::CustomMaskType::NoCustomMask; @@ -1150,8 +1155,13 @@ std::tuple _efficient_ // ROCM Implementation if( bias.has_value() ) { std::cout << std::endl; - std::cout << "Attn_bias sizes: " << bias.value().sizes() << std::endl; + std::cout << "Attn_bias sizes : " << bias.value().sizes() << std::endl; + std::cout << "attn_bias device: " << bias.value().device() << std::endl; } + + // Need this in both aot and CK case + const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float(); + if(at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { //forward_attention_ck(...); @@ -1159,6 +1169,8 @@ std::tuple _efficient_ std::optional out = std::nullopt; std::optional seqused_k = std::nullopt; std::optional alibi_slopes = std::nullopt; + + auto [out_, q, @@ -1175,7 +1187,7 @@ std::tuple _efficient_ dropout_p, false, // return dropout_randval custom_mask_type == 0 ? 
false : true, // is_causal - scale, + softmax_scale, bias, out, std::nullopt, // cu_seqlens_q: sending in nothing since CKFA works this way @@ -1217,7 +1229,6 @@ std::tuple _efficient_ TORCH_CHECK(false, "[_efficient_attention_forward] Unsupported mask type on ROCM, for now"); } - const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float(); using aotriton::v2::flash::attn_fwd; using aotriton::v2::flash::attn_fwd_compact_varlen; From 518c0cb1534288033aebc86fb61e2277fd713d30 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Fri, 7 Feb 2025 18:54:21 +0000 Subject: [PATCH 25/46] Finished initial implementation for fwd (post-compile, pre-run) --- .../native/transformers/cuda/attention.cu | 5 +- .../hip/flash_attn/ck/mha_fwd_ck.hip | 59 ++++++++++++------- 2 files changed, 41 insertions(+), 23 deletions(-) diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index 3c5363fabd06..d487055d4cb2 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -1157,6 +1157,7 @@ std::tuple _efficient_ std::cout << std::endl; std::cout << "Attn_bias sizes : " << bias.value().sizes() << std::endl; std::cout << "attn_bias device: " << bias.value().device() << std::endl; + std::cout << "last dim stride: " << bias.value().stride(-1) << std::endl; } // Need this in both aot and CK case @@ -1166,7 +1167,7 @@ std::tuple _efficient_ at::ROCmFABackend::Ck) { //forward_attention_ck(...); std::cout << "In my branch" << std::endl; - std::optional out = std::nullopt; + std::optional out(res); std::optional seqused_k = std::nullopt; std::optional alibi_slopes = std::nullopt; @@ -1176,7 +1177,7 @@ std::tuple _efficient_ q, k, v, - softmax_lse, + logsumexp, seed_t, offset_t, p] = diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip index f7d2d304b8df..64fd5d266494 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip @@ -6,7 +6,7 @@ #include #include - +#include namespace pytorch_flash { @@ -16,7 +16,7 @@ fmha_fwd_traits get_ck_fmha_fwd_traits(const mask_info &mask, int head_size, bool has_dropout, bool has_lse, - bool enable_alibi) + bool enable_bias) { return fmha_fwd_traits{head_size, head_size, @@ -24,7 +24,7 @@ fmha_fwd_traits get_ck_fmha_fwd_traits(const mask_info &mask, false, // is_group_mode true, // is_v_rowmajor mask.type, - enable_alibi ? bias_enum::alibi : bias_enum::no_bias, + enable_bias ? bias_enum::elementwise_bias : bias_enum::no_bias, has_lse, has_dropout, false}; // do_fp8_static_quant @@ -44,7 +44,7 @@ fmha_fwd_args get_ck_fmha_fwd_args(bool has_lse, const at::Tensor q, const at::Tensor k, const at::Tensor v, - std::optional &alibi_slopes_, + std::optional &attn_bias_, at::Tensor out, at::Tensor softmax_lse, at::Tensor dropout_randval, @@ -58,6 +58,7 @@ fmha_fwd_args get_ck_fmha_fwd_args(bool has_lse, // o: (batch_size, seqlen_q, nheads, d) // alibi_slopes:(batch_size, nheads) or (nhead) + // attn_bias: (batch_size, nheads, seqlen_q, seqlen_k) // lse: (batch_size, nheads, seqlen_q) // randval: (batch_size, nheads, seqlen_q, seqlen_k) @@ -82,22 +83,30 @@ fmha_fwd_args get_ck_fmha_fwd_args(bool has_lse, ck_tile::index_t batch_stride_lse = has_lse ? softmax_lse.stride(0) : 0; ck_tile::index_t batch_stride_randval = has_dropout_randval ? 
dropout_randval.stride(0) : 0; - void *alibi_slopes_ptr = nullptr; - ck_tile::index_t stride_alibi_slopes = 0; + void *attn_bias_ptr = nullptr; + ck_tile::index_t stride_attn_bias = 0; + + if (attn_bias_.has_value()) { + auto a_b = attn_bias_.value(); + CHECK_DEVICE(a_b); + TORCH_CHECK(a_b.stride(-1) == 1, "attention bias tensor must have contiguous last dimension"); + // Following check was a remnant of the expectation that the bias tensor would be alibi which only has 2 dimensions + // whereas what pytorch provides (elementwise bias) is a 4 dimensional tensor of the shape {b, h, s_q, s_k} + //TORCH_CHECK(alibi_slopes.sizes() == at::IntArrayRef({h}) || alibi_slopes.sizes() == at::IntArrayRef({b, h})); + attn_bias_ptr = a_b.data_ptr(); + // Previously, looks like alibi slopes could be 1 of 2 shapes. either {h} or {b, h} so this check makes sure + // to grab the batch size if there are 2 dimensions, and a stride of zero if there's only one dimension. + // so assuming we are getting what pytorch wants to give us which is the aforementioned 4-d tensor, just grab the batch + // dimension + //stride_attn_bias = alibi_slopes.dim() == 2 ? alibi_slopes.stride(0) : 0; + stride_attn_bias = a_b.stride(0); - if (alibi_slopes_.has_value()) { - auto alibi_slopes = alibi_slopes_.value(); - CHECK_DEVICE(alibi_slopes); - TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension"); - TORCH_CHECK(alibi_slopes.sizes() == at::IntArrayRef({h}) || alibi_slopes.sizes() == at::IntArrayRef({b, h})); - alibi_slopes_ptr = alibi_slopes.data_ptr(); - stride_alibi_slopes = alibi_slopes.dim() == 2 ? alibi_slopes.stride(0) : 0; } return fmha_fwd_args{q.data_ptr(), k.data_ptr(), v.data_ptr(), - alibi_slopes_ptr, // bias + attn_bias_ptr, // bias has_dropout_randval ? dropout_randval.data_ptr() : nullptr, has_lse ? 
softmax_lse.data_ptr() : nullptr, out.data_ptr(), @@ -118,7 +127,7 @@ fmha_fwd_args get_ck_fmha_fwd_args(bool has_lse, stride_q, stride_k, stride_v, - stride_alibi_slopes, + stride_attn_bias, stride_randval, stride_o, nhead_stride_q, @@ -148,7 +157,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size std::optional &out_, // batch_size x seqlen_q x num_heads xhead_size - std::optional &alibi_slopes_, // num_heads or batch_size x num_heads + std::optional &alibi_slopes_, // num_heads or batch_size x num_heads: Not used const float p_dropout, const float softmax_scale, bool is_causal, @@ -156,7 +165,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x int window_size_right, const bool return_dropout_randval, std::optional gen_, - const std::optional& attn_bias_) + const std::optional& attn_bias_) // batch_size x nheads x seqlen_q x seqlen_k { auto q_dtype = q.dtype(); TORCH_CHECK(q_dtype == at::kHalf || q_dtype == at::kBFloat16, @@ -190,7 +199,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x if (window_size_right >= seqlen_k) { window_size_right = -1; } // causal=true is the same as causal=false in this case - if (seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; } + if (seqlen_q == 1 && !attn_bias_.has_value()) { is_causal = false; } mask_info mask; if (is_causal) { @@ -210,7 +219,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case // H/t Daniel Haziza - const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size % 8 == 0 && !alibi_slopes_.has_value(); + const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size % 8 == 0 && !attn_bias_.has_value(); const int ngroups = num_heads / num_heads_k; at::Tensor temp_q = q; if (seqlenq_ngroups_swapped) { @@ -306,6 +315,14 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x offset_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA)); } + // remove const from attn_bias_ + // TODO: sanity check this + std::optional attn_bias; + if( attn_bias_.has_value()) + { + attn_bias = attn_bias_; + } + if (seqlen_k > 0) { auto drop_seed_offset = std::make_pair(rng_state_ptr, rng_state_ptr + 1); auto stream = at::cuda::getCurrentHIPStream().stream(); @@ -318,7 +335,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x head_size_8x, has_dropout, has_lse, - alibi_slopes_.has_value()); + attn_bias_.has_value()); auto args = get_ck_fmha_fwd_args( @@ -334,7 +351,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x q, k, v, - alibi_slopes_, + attn_bias, out, softmax_lse, p, From 20f5401fe8a303ed1d0de9c289904e38d52d28ae Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Fri, 7 Feb 2025 19:46:21 +0000 Subject: [PATCH 26/46] Saving place, BLOCKED on codegen --- aten/src/ATen/native/transformers/cuda/attention.cu | 6 ++++-- .../native/transformers/hip/flash_attn/ck/me_fwd_ck.hip | 5 ++++- test/test_transformers.py | 1 + 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index d487055d4cb2..a8735d11dec7 100644 --- 
a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -1162,7 +1162,9 @@ std::tuple _efficient_ // Need this in both aot and CK case const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float(); + res = at::empty({B, M, num_heads, Kv}, query.options()); + std::cout << "CK Enabled?: " << at::globalContext().getROCmFAPreferredBackend() << std::endl; if(at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { //forward_attention_ck(...); @@ -1170,7 +1172,7 @@ std::tuple _efficient_ std::optional out(res); std::optional seqused_k = std::nullopt; std::optional alibi_slopes = std::nullopt; - + std::cout << "out(res) dtype " << out.value().dtype(); auto [out_, @@ -1212,7 +1214,6 @@ std::tuple _efficient_ // performance, but for now it requires compact logsumexp tensor, even if // compute_logsumexp is false constexpr int kAlignLSE = 1; - res = at::empty({B, M, num_heads, Kv}, query.options()); logsumexp = at::empty( { B, num_heads, max_seqlen_q }, query.options().dtype(at::ScalarType::Float)); @@ -1464,6 +1465,7 @@ std::tuple _efficient_ AT_CUDA_CHECK(cudaGetLastError()); #endif // USE_ROCM + std::cout << "res dtype: " << res.dtype() << std::endl; return std::make_tuple( std::move(res), std::move(logsumexp), diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip index e1f98177fb86..db8ca2d66f6b 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip @@ -1,6 +1,8 @@ #include #include +#include + namespace pytorch_flash { std::tuple< at::Tensor, // output @@ -29,7 +31,8 @@ mem_eff_forward_ck( std::optional& seqused_k_, std::optional& alibi_slopes_) { - + std::cout << std::endl; + std::cout << "MADE IT INTO MY CODE " << std::endl; // These normally get passed in as std::nullopt so just set to -1 // Note: See attention.cu ~line 928 and line 729 const int non_null_window_left = -1; diff --git a/test/test_transformers.py b/test/test_transformers.py index f8dfbfff9b50..f3ad8fb1edda 100644 --- a/test/test_transformers.py +++ b/test/test_transformers.py @@ -2599,6 +2599,7 @@ def test_attention(backend: SDPBackend, permute_order: list[list[int]]): @unittest.skipIf(not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Fused SDPA was not built for this system") @parametrize("mask_dim", [1, 2, 3, 4]) def test_mem_efficient_attention_mask_variants(self, device, mask_dim: list[int]): + torch.backends.cuda.preferred_rocm_fa_library("ck") dtype = torch.float16 make_tensor = partial(torch.rand, device=device, dtype=dtype, requires_grad=True) batch, num_heads, head_dim = 1, 4, 8 From e484624a079e7d2f234766535c1d7aa11c221244 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Wed, 12 Feb 2025 20:40:19 +0000 Subject: [PATCH 27/46] debug traces DELETE LATER --- .../ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip | 1 + 1 file changed, 1 insertion(+) diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip index 64fd5d266494..7b4cd9ffec98 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip @@ -320,6 +320,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x std::optional attn_bias; if( attn_bias_.has_value()) { + std::cout << "CONFIRMED YOUR CODE IS 
GETTING HIT AND ATTENTION BIAS IS SET" << std::endl; attn_bias = attn_bias_; } From 8f9f16535bd13ab4a6519b4094cadd83f09a1fa5 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Wed, 12 Feb 2025 21:05:46 +0000 Subject: [PATCH 28/46] remove backward from test. START BWD AFTER THIS COMMIT --- test/test_transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_transformers.py b/test/test_transformers.py index f3ad8fb1edda..cf8d8461938b 100644 --- a/test/test_transformers.py +++ b/test/test_transformers.py @@ -2624,7 +2624,7 @@ def test_mem_efficient_attention_mask_variants(self, device, mask_dim: list[int] mask = torch.randn((batch, num_heads, seq_len_q, seq_len_kv), device=device, dtype=dtype) with sdpa_kernel(backends=[SDPBackend.EFFICIENT_ATTENTION]): out = F.scaled_dot_product_attention(query, key, value, mask) - out.sum().backward() + #out.sum().backward() @unittest.skipIf(not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Fused SDPA was not built for this system") @parametrize("dtype", [torch.float, torch.float16]) From 84e81977dbb019cdb238446b0f7e4d5cc0fb36a2 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Wed, 12 Feb 2025 22:43:21 +0000 Subject: [PATCH 29/46] First draft of bwd function signature (pre-compile) --- .../native/transformers/cuda/attention.cu | 3 -- .../hip/flash_attn/ck/me_bwd_ck.hip | 17 ++++++++++ .../hip/flash_attn/ck/me_ck_api.h | 32 +++++++++++++++++++ 3 files changed, 49 insertions(+), 3 deletions(-) create mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index a8735d11dec7..d6bec9e29f8f 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -77,9 +77,6 @@ #include #include #include - - - #else // MemoryEfficient Attention Specific Imports for ROCM #include diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip new file mode 100644 index 000000000000..c70f6de26551 --- /dev/null +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip @@ -0,0 +1,17 @@ +#include +#include + +#include + +namespace pytorch_flash { +// TODO get return tensors correct +std::tuple< + at::Tensor> +mem_eff_backward_ck( +// TODO get input args correct +) { +// TODO implement wrapper +} + + +} // namespace pytorch_flash diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h index 0fac19071ff7..f38877009838 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h @@ -34,5 +34,37 @@ mem_eff_forward_ck( std::optional& alibi_slopes_ ); +// TODO get return tensors correct +std::tuple< + at::Tensor, // dQ + at::Tensor, // dK + at::Tensor, // dV + at::Tensor> // dBias +mem_eff_backward_ck( + const at::Tensor &dout, + const at::Tensor &q, + const at::Tensor &k, + const at::Tensor &v, + const at::Tensor &out, + const at::Tensor &softmax_lse, + std::optional &dq_, + std::optional &dk_, + std::optional &dv_, + std::optional &alibi_slopes_, + const at::Tensor &cu_seqlens_q, + const at::Tensor &cu_seqlens_k, + const int max_seqlen_q, + const int max_seqlen_k, + const float p_dropout, + const float scale, + const bool is_causal, + const bool deterministic, + const bool zero_tensors, + const 
at::Tensor philox_seed, + const at::Tensor philox_offset) +{ +// TODO implement wrapper +} + } // namespace pytorch_flash From 7f003d1f5f295fc8692660c3039d4b4fc3e3dec3 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Thu, 13 Feb 2025 20:17:54 +0000 Subject: [PATCH 30/46] Calling my function as a no-op (pre-compile) --- .../transformers/cuda/attention_backward.cu | 39 ++++++++++++++++++- .../hip/flash_attn/ck/me_bwd_ck.hip | 32 +++++++++++++-- .../hip/flash_attn/ck/me_ck_api.h | 2 +- 3 files changed, 67 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu index 09799ff125d1..544468173886 100644 --- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu +++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu @@ -47,6 +47,7 @@ #include #include #include +#include #endif #endif @@ -409,6 +410,42 @@ _efficient_attention_backward( #ifdef USE_ROCM // ROCM Implementation + if(at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) + { + std::cout "BACKWARD CK ATTENTION" << std::endl; + const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float(); + auto + [grad_q, + grad_k, + grad_v, + grad_bias] = + pytorch_flash::mem_eff_backward_ck( + grad_out, + query, + key, + value, + out, + logsumexp, + grad_q, + grad_k, + grad_v, + bias, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + float(p_dropout), + softmax_scale, + custom_mask_type == 0 ? false : true, // is_causal + false, // deterministic + false, // zero_tensors + philox_seed, + philox_offset); + + } + + + // TODO_ANDY: Put this in the `else` part of the above condish TORCH_CHECK(!num_splits_key.has_value(), "ROCM does not support num_split_keys in _efficient_attention_forward"); TORCH_CHECK(!window_size.has_value(), @@ -492,7 +529,7 @@ _efficient_attention_backward( is_causal, stream); } -#else +#else // USE_CUDA at::Tensor workspace; cudaDeviceProp* p = at::cuda::getDeviceProperties(query.device().index()); const int computeCapability = p->major * 10 + p->minor; diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip index c70f6de26551..ca5d68fa5eff 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip @@ -4,13 +4,37 @@ #include namespace pytorch_flash { -// TODO get return tensors correct std::tuple< - at::Tensor> + at::Tensor, // dQ + at::Tensor, // dK + at::Tensor, // dV + at::Tensor> // dBias mem_eff_backward_ck( -// TODO get input args correct -) { + const at::Tensor &dout, + const at::Tensor &q, + const at::Tensor &k, + const at::Tensor &v, + const at::Tensor &out, + const at::Tensor &softmax_lse, + std::optional &dq_, + std::optional &dk_, + std::optional &dv_, + std::optional &attn_bias, + const at::Tensor &cu_seqlens_q, + const at::Tensor &cu_seqlens_k, + const int max_seqlen_q, + const int max_seqlen_k, + const float p_dropout, + const float scale, + const bool is_causal, + const bool deterministic, + const bool zero_tensors, + const at::Tensor philox_seed, + const at::Tensor philox_offset) +{ // TODO implement wrapper + std::cout << "HIT MY MEM_EFF ENTRY POINT" << std::endl; + return std::make_tuple(Tensor{}, Tensor{}, Tensor{}, Tensor{}); } diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h index f38877009838..170b9f452c96 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h @@ -50,7 +50,7 @@ mem_eff_backward_ck( std::optional &dq_, std::optional &dk_, std::optional &dv_, - std::optional &alibi_slopes_, + std::optional &attn_bias, const at::Tensor &cu_seqlens_q, const at::Tensor &cu_seqlens_k, const int max_seqlen_q, From 11099abbec47522214ad0a5da9d63d5a43527841 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Thu, 13 Feb 2025 23:47:58 +0000 Subject: [PATCH 31/46] Calling my function as a no-op (post-compile) --- .../transformers/cuda/attention_backward.cu | 13 ++++---- .../hip/flash_attn/ck/me_bwd_ck.hip | 30 +++++++++---------- .../hip/flash_attn/ck/me_ck_api.h | 30 ++++++++----------- 3 files changed, 35 insertions(+), 38 deletions(-) diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu index 544468173886..fa933e7b2969 100644 --- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu +++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu @@ -412,13 +412,14 @@ _efficient_attention_backward( // ROCM Implementation if(at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { - std::cout "BACKWARD CK ATTENTION" << std::endl; + std::cout << "BACKWARD CK ATTENTION" << std::endl; const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float(); + // TODO_ANDY: make sure we are returning the same tensor that is in grad_X auto - [grad_q, - grad_k, - grad_v, - grad_bias] = + [dQ, + dK, + dV, + dBias] = pytorch_flash::mem_eff_backward_ck( grad_out, query, @@ -434,7 +435,7 @@ _efficient_attention_backward( cu_seqlens_k, max_seqlen_q, max_seqlen_k, - float(p_dropout), + float(dropout_p), softmax_scale, custom_mask_type == 0 ? 
false : true, // is_causal false, // deterministic diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip index ca5d68fa5eff..3b94b17a6f5f 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip @@ -16,25 +16,25 @@ mem_eff_backward_ck( const at::Tensor &v, const at::Tensor &out, const at::Tensor &softmax_lse, - std::optional &dq_, - std::optional &dk_, - std::optional &dv_, + const at::Tensor &dq_, + const at::Tensor &dk_, + const at::Tensor &dv_, std::optional &attn_bias, - const at::Tensor &cu_seqlens_q, - const at::Tensor &cu_seqlens_k, - const int max_seqlen_q, - const int max_seqlen_k, - const float p_dropout, - const float scale, - const bool is_causal, - const bool deterministic, - const bool zero_tensors, - const at::Tensor philox_seed, - const at::Tensor philox_offset) + std::optional &cu_seqlens_q, + std::optional &cu_seqlens_k, + int max_seqlen_q, + int max_seqlen_k, + float p_dropout, + float scale, + bool is_causal, + bool deterministic, + bool zero_tensors, + at::Tensor philox_seed, + at::Tensor philox_offset) { // TODO implement wrapper std::cout << "HIT MY MEM_EFF ENTRY POINT" << std::endl; - return std::make_tuple(Tensor{}, Tensor{}, Tensor{}, Tensor{}); + return std::make_tuple(at::Tensor{}, at::Tensor{}, at::Tensor{}, at::Tensor{}); } diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h index 170b9f452c96..88e811590d64 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h @@ -47,24 +47,20 @@ mem_eff_backward_ck( const at::Tensor &v, const at::Tensor &out, const at::Tensor &softmax_lse, - std::optional &dq_, - std::optional &dk_, - std::optional &dv_, + const at::Tensor &dq_, + const at::Tensor &dk_, + const at::Tensor &dv_, std::optional &attn_bias, - const at::Tensor &cu_seqlens_q, - const at::Tensor &cu_seqlens_k, - const int max_seqlen_q, - const int max_seqlen_k, - const float p_dropout, - const float scale, - const bool is_causal, - const bool deterministic, - const bool zero_tensors, + std::optional &cu_seqlens_q, + std::optional &cu_seqlens_k, + int max_seqlen_q, + int max_seqlen_k, + float p_dropout, + float scale, + bool is_causal, + bool deterministic, + bool zero_tensors, const at::Tensor philox_seed, - const at::Tensor philox_offset) -{ -// TODO implement wrapper -} - + const at::Tensor philox_offset); } // namespace pytorch_flash From 371a907d0916580968a7d39a4abaa1eadd82a035 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Fri, 14 Feb 2025 00:36:08 +0000 Subject: [PATCH 32/46] getting ready to implement wrapper. 
just some comments for that --- .../native/transformers/cuda/attention_backward.cu | 10 +++++----- .../transformers/hip/flash_attn/ck/me_bwd_ck.hip | 8 ++++++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu index fa933e7b2969..ef1f8584c03f 100644 --- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu +++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu @@ -410,10 +410,10 @@ _efficient_attention_backward( #ifdef USE_ROCM // ROCM Implementation - if(at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) - { +// if(at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) +// { std::cout << "BACKWARD CK ATTENTION" << std::endl; - const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float(); + const auto my_softmax_scale = sdp::calculate_scale(query, scale).expect_float(); // TODO_ANDY: make sure we are returning the same tensor that is in grad_X auto [dQ, @@ -436,14 +436,14 @@ _efficient_attention_backward( max_seqlen_q, max_seqlen_k, float(dropout_p), - softmax_scale, + my_softmax_scale, custom_mask_type == 0 ? false : true, // is_causal false, // deterministic false, // zero_tensors philox_seed, philox_offset); - } +// } // TODO_ANDY: Put this in the `else` part of the above condish diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip index 3b94b17a6f5f..966189470e4d 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip @@ -34,6 +34,14 @@ mem_eff_backward_ck( { // TODO implement wrapper std::cout << "HIT MY MEM_EFF ENTRY POINT" << std::endl; + if(!cu_seqlens_q.has_value()) { + // both of these return dq, dk, dv, softmax_d + // need to also return attn_bias + // call mha_bwd_ck + } else { + // call mha_varlen_bwd + } + return std::make_tuple(at::Tensor{}, at::Tensor{}, at::Tensor{}, at::Tensor{}); } From c73aed18d68e97d30cbd18d1b36260ab86106ef2 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Fri, 14 Feb 2025 21:47:51 +0000 Subject: [PATCH 33/46] calling mha stuff in wrapper (pre-compile) --- .../hip/flash_attn/ck/me_bwd_ck.hip | 63 ++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip index 966189470e4d..0a0b9605f58e 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip @@ -34,12 +34,73 @@ mem_eff_backward_ck( { // TODO implement wrapper std::cout << "HIT MY MEM_EFF ENTRY POINT" << std::endl; + + const int non_null_window_left = -1; + const int non_null_window_right = -1; + if(!cu_seqlens_q.has_value()) { // both of these return dq, dk, dv, softmax_d // need to also return attn_bias // call mha_bwd_ck + auto + [dQ, + dK, + dV, + softmax_d] = + mha_bwd_ck( + dout, + q, + k, + v, + out, + softmax_lse, + dq_, + dk_, + dv_, + attn_bias, + p_dropout, + scale, + is_causal, + non_null_window_left, + non_null_window_right, + deterministic, + philox_seed, + philox_offset); + //TODO_ANDY: make this also return attention bias + return std::make_tuple(dQ, dK, dV, softmax_d); + } else { - // call mha_varlen_bwd + // call mha_varlen_bwd_ck + auto + 
[dQ, + dK, + dV, + softmax_d] = + mha_varlen_bwd_ck( + dout, + q, + k, + v, + out, + softmax_lse, + dq_, + dk_, + dv_, + cu_seqlens_q, + cu_seqlens_k, + attn_bias, + max_seqlen_q, + max_seqlen_k, + p_dropout, + scale, + is_causal, + non_null_window_left, + non_null_window_right, + deterministic, + philox_seed, + philox_offset); + return std::make_tuple(dQ, dK, dV, softmax_d); + } return std::make_tuple(at::Tensor{}, at::Tensor{}, at::Tensor{}, at::Tensor{}); From cc5db9f2818a186d622da4deab732fbbd3476aee Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Mon, 17 Feb 2025 18:38:41 +0000 Subject: [PATCH 34/46] calling mha stuff in wrapper (post-compile) --- .../hip/flash_attn/ck/me_bwd_ck.hip | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip index 0a0b9605f58e..53e6c4b6409f 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip @@ -38,6 +38,12 @@ mem_eff_backward_ck( const int non_null_window_left = -1; const int non_null_window_right = -1; + // Wrap gradients in std::optional + std::optional opt_dQ, opt_dK, opt_dV; + opt_dQ = dq_; + opt_dK = dk_; + opt_dV = dv_; + if(!cu_seqlens_q.has_value()) { // both of these return dq, dk, dv, softmax_d // need to also return attn_bias @@ -54,9 +60,9 @@ mem_eff_backward_ck( v, out, softmax_lse, - dq_, - dk_, - dv_, + opt_dQ, + opt_dK, + opt_dV, attn_bias, p_dropout, scale, @@ -83,16 +89,17 @@ mem_eff_backward_ck( v, out, softmax_lse, - dq_, - dk_, - dv_, - cu_seqlens_q, - cu_seqlens_k, + opt_dQ, + opt_dK, + opt_dV, + cu_seqlens_q.value(), + cu_seqlens_k.value(), attn_bias, max_seqlen_q, max_seqlen_k, p_dropout, scale, + zero_tensors, is_causal, non_null_window_left, non_null_window_right, From 27e2bbd72247997351ef1b3599c608164aaa8f67 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Tue, 18 Feb 2025 19:54:43 +0000 Subject: [PATCH 35/46] Start feeding grad_bias through (post-compile) --- aten/src/ATen/native/transformers/cuda/attention_backward.cu | 4 ++++ .../ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip | 2 ++ .../ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h | 2 ++ 3 files changed, 8 insertions(+) diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu index ef1f8584c03f..89fd19383785 100644 --- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu +++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu @@ -414,6 +414,8 @@ _efficient_attention_backward( // { std::cout << "BACKWARD CK ATTENTION" << std::endl; const auto my_softmax_scale = sdp::calculate_scale(query, scale).expect_float(); + // Store grad_bias in optional + std::optional opt_grad_bias = grad_bias; // TODO_ANDY: make sure we are returning the same tensor that is in grad_X auto [dQ, @@ -431,6 +433,8 @@ _efficient_attention_backward( grad_k, grad_v, bias, + bias_requires_grad, + opt_grad_bias, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip index 53e6c4b6409f..ef5a0e9e301c 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip @@ -20,6 +20,8 @@ mem_eff_backward_ck( const at::Tensor 
&dk_, const at::Tensor &dv_, std::optional &attn_bias, + bool bias_requires_grad, + std::optional &grad_bias, std::optional &cu_seqlens_q, std::optional &cu_seqlens_k, int max_seqlen_q, diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h index 88e811590d64..439073c2c631 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h @@ -51,6 +51,8 @@ mem_eff_backward_ck( const at::Tensor &dk_, const at::Tensor &dv_, std::optional &attn_bias, + bool bias_requires_grad, + std::optional &grad_bias, std::optional &cu_seqlens_q, std::optional &cu_seqlens_k, int max_seqlen_q, From fc1c4189b6470e157f00133116ab12e7dfd580fb Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Tue, 18 Feb 2025 20:27:18 +0000 Subject: [PATCH 36/46] Remove unneeded re-naming scripts --- .../ck/rename_ck_autogen_files.output.txt | 1810 ----------------- .../flash_attn/ck/rename_ck_autogen_files.sh | 11 - 2 files changed, 1821 deletions(-) delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/rename_ck_autogen_files.output.txt delete mode 100644 aten/src/ATen/native/transformers/hip/flash_attn/ck/rename_ck_autogen_files.sh diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/rename_ck_autogen_files.output.txt b/aten/src/ATen/native/transformers/hip/flash_attn/ck/rename_ck_autogen_files.output.txt deleted file mode 100644 index 78f844fd2a1e..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/rename_ck_autogen_files.output.txt +++ /dev/null @@ -1,1810 +0,0 @@ -fmha_bwd_api.hip -> fmha_ck_autogen_5919133d2ed892745013b2fc5d503414cf0a4d83.hip -fmha_bwd_convert_dq_d128_bf16_b64x128_batch_o2.hip -> fmha_ck_autogen_e11a3b7d4fdfed64e64f7a95dbc64eff541092d6.hip -fmha_bwd_convert_dq_d128_bf16_b64x128_batch_o2_deterministic.hip -> fmha_ck_autogen_01cb354dddef6e99e4ac843f2adafcddfc58d520.hip -fmha_bwd_convert_dq_d128_bf16_b64x128_batch_o2_pd.hip -> fmha_ck_autogen_1b3e7c8969027d3316875f33dc50fe022e05ce37.hip -fmha_bwd_convert_dq_d128_bf16_b64x128_batch_o2_pd_deterministic.hip -> fmha_ck_autogen_38273a2f8e6bbb42ba0b0871b6c95abb34531f33.hip -fmha_bwd_convert_dq_d128_bf16_b64x128_batch_o2_ps.hip -> fmha_ck_autogen_2d43460c011b8d5e01ea98c9b8ddce962de59a96.hip -fmha_bwd_convert_dq_d128_bf16_b64x128_batch_o2_ps_deterministic.hip -> fmha_ck_autogen_4c0c50a1fac82d47dff2357ee3ddbfa0b2c8d487.hip -fmha_bwd_convert_dq_d128_bf16_b64x128_batch_o2_psd.hip -> fmha_ck_autogen_2a3a980a26682d879c3a3425f3ba5be3f5761adf.hip -fmha_bwd_convert_dq_d128_bf16_b64x128_batch_o2_psd_deterministic.hip -> fmha_ck_autogen_008f2429c678d13386a06e8d8b15c4b480940ff3.hip -fmha_bwd_convert_dq_d128_bf16_b64x128_group_o2_ps.hip -> fmha_ck_autogen_811db756577b61cde9fe8279d956980db9ee21a4.hip -fmha_bwd_convert_dq_d128_bf16_b64x128_group_o2_ps_deterministic.hip -> fmha_ck_autogen_492fbc418e829f89bcb8d93f8afd2869dd8dfccc.hip -fmha_bwd_convert_dq_d128_bf16_b64x128_group_o2_psd.hip -> fmha_ck_autogen_75f2010bf6c478d2f0eba77e912697661306c1cb.hip -fmha_bwd_convert_dq_d128_bf16_b64x128_group_o2_psd_deterministic.hip -> fmha_ck_autogen_0153ec18d3ded0f8bdc6459ea5757ebd94d9faf2.hip -fmha_bwd_convert_dq_d128_fp16_b64x128_batch_o2.hip -> fmha_ck_autogen_3eb2ea922daabbba131b90713e06d8caf5f30662.hip -fmha_bwd_convert_dq_d128_fp16_b64x128_batch_o2_deterministic.hip -> fmha_ck_autogen_c0f76aff077c28f8afd7b22f284cf2894e08a043.hip 
-fmha_bwd_convert_dq_d128_fp16_b64x128_batch_o2_pd.hip -> fmha_ck_autogen_f48f8b681a405bfeba5aadaef40f32367ec5cd2b.hip -fmha_bwd_convert_dq_d128_fp16_b64x128_batch_o2_pd_deterministic.hip -> fmha_ck_autogen_4cabdafad0bf803223ba5e8f474cd59233dc48cb.hip -fmha_bwd_convert_dq_d128_fp16_b64x128_batch_o2_ps.hip -> fmha_ck_autogen_0801c56831b4c6428200db6318638a2129bb197a.hip -fmha_bwd_convert_dq_d128_fp16_b64x128_batch_o2_ps_deterministic.hip -> fmha_ck_autogen_91b9e2616c2fe0480096b1ccf0f74d584b220146.hip -fmha_bwd_convert_dq_d128_fp16_b64x128_batch_o2_psd.hip -> fmha_ck_autogen_4f1e1c969b57659e7e1367ac9ba10ed5ef5b69a9.hip -fmha_bwd_convert_dq_d128_fp16_b64x128_batch_o2_psd_deterministic.hip -> fmha_ck_autogen_ecd7dec90b3c62bf3a30bd75d3c6869529a06b01.hip -fmha_bwd_convert_dq_d128_fp16_b64x128_group_o2_ps.hip -> fmha_ck_autogen_88ea5b5346c87cc4fc1e841c518080df4ab811a2.hip -fmha_bwd_convert_dq_d128_fp16_b64x128_group_o2_ps_deterministic.hip -> fmha_ck_autogen_4395d3c96b3f4556b9765fd0a3b5701b2fb10948.hip -fmha_bwd_convert_dq_d128_fp16_b64x128_group_o2_psd.hip -> fmha_ck_autogen_b8fbc6f6e9c515edce3c7a438b3bc308b30d3857.hip -fmha_bwd_convert_dq_d128_fp16_b64x128_group_o2_psd_deterministic.hip -> fmha_ck_autogen_490a68220a7b621ae9817d7b77f55de239b0a4f3.hip -fmha_bwd_convert_dq_d256_bf16_b64x64_batch_o2.hip -> fmha_ck_autogen_344932e2655d7b32704be8de9a63bbd8c3369f02.hip -fmha_bwd_convert_dq_d256_bf16_b64x64_batch_o2_deterministic.hip -> fmha_ck_autogen_5a85ae0a16e4b293b549bcb6a3ee52df7fccca32.hip -fmha_bwd_convert_dq_d256_bf16_b64x64_batch_o2_pd.hip -> fmha_ck_autogen_963986150adcd6e1d3886bacf2166de1252e14df.hip -fmha_bwd_convert_dq_d256_bf16_b64x64_batch_o2_pd_deterministic.hip -> fmha_ck_autogen_8bd1a40b12ce927323594fcce61eb9c20cc5e3d4.hip -fmha_bwd_convert_dq_d256_bf16_b64x64_batch_o2_ps.hip -> fmha_ck_autogen_296c5836ba118969c4ba89ed62a98dffe3105738.hip -fmha_bwd_convert_dq_d256_bf16_b64x64_batch_o2_ps_deterministic.hip -> fmha_ck_autogen_6cfb7075345704340ff33dc0ef7c04ef127f26ad.hip -fmha_bwd_convert_dq_d256_bf16_b64x64_batch_o2_psd.hip -> fmha_ck_autogen_22511de2592b6e350737e44865e1fed6496e3f32.hip -fmha_bwd_convert_dq_d256_bf16_b64x64_batch_o2_psd_deterministic.hip -> fmha_ck_autogen_609f68180582384ba81aae2b1d4a4c52dde2c68c.hip -fmha_bwd_convert_dq_d256_bf16_b64x64_group_o2_ps.hip -> fmha_ck_autogen_c9fe51f982abd60e567d4238d3266fb60e45814b.hip -fmha_bwd_convert_dq_d256_bf16_b64x64_group_o2_ps_deterministic.hip -> fmha_ck_autogen_10a055e5c3d6a953d470db5dc21449766248058a.hip -fmha_bwd_convert_dq_d256_bf16_b64x64_group_o2_psd.hip -> fmha_ck_autogen_327e27892bc57f3dec0da24f94f2a483d6c9321b.hip -fmha_bwd_convert_dq_d256_bf16_b64x64_group_o2_psd_deterministic.hip -> fmha_ck_autogen_c581974c8b6f43f60d0af29c350d850b55c03121.hip -fmha_bwd_convert_dq_d256_fp16_b64x64_batch_o2.hip -> fmha_ck_autogen_01ac1a2ecf9a487809e46faa92e267df2d47de91.hip -fmha_bwd_convert_dq_d256_fp16_b64x64_batch_o2_deterministic.hip -> fmha_ck_autogen_dbc4135fce01e8731fec7a78d0cc0fdeeae28b90.hip -fmha_bwd_convert_dq_d256_fp16_b64x64_batch_o2_pd.hip -> fmha_ck_autogen_e09d9baa269dfbb30b714389d1733be51cc419b7.hip -fmha_bwd_convert_dq_d256_fp16_b64x64_batch_o2_pd_deterministic.hip -> fmha_ck_autogen_5f71e663978dbcba859c5114ec675a712e343fd6.hip -fmha_bwd_convert_dq_d256_fp16_b64x64_batch_o2_ps.hip -> fmha_ck_autogen_d257148f457557ea80ca56690e525db3a4b0ff55.hip -fmha_bwd_convert_dq_d256_fp16_b64x64_batch_o2_ps_deterministic.hip -> fmha_ck_autogen_8e2c587db8bd9f1b551624e0cf8b67a90245d7da.hip 
-fmha_bwd_convert_dq_d256_fp16_b64x64_batch_o2_psd.hip -> fmha_ck_autogen_8c13c4f3f645a2bb475eb1c55ce1de452f0e2332.hip -fmha_bwd_convert_dq_d256_fp16_b64x64_batch_o2_psd_deterministic.hip -> fmha_ck_autogen_7b7fa76609243a8709f349ffc0d9d88157f28dc9.hip -fmha_bwd_convert_dq_d256_fp16_b64x64_group_o2_ps.hip -> fmha_ck_autogen_2b3326e055da32cc979892a2fbd0f7b003cb9f98.hip -fmha_bwd_convert_dq_d256_fp16_b64x64_group_o2_ps_deterministic.hip -> fmha_ck_autogen_671828f15eec2a58be23063a1a8132d337cd26de.hip -fmha_bwd_convert_dq_d256_fp16_b64x64_group_o2_psd.hip -> fmha_ck_autogen_457eaffbff3c58183a656687010daa2c16cfc26e.hip -fmha_bwd_convert_dq_d256_fp16_b64x64_group_o2_psd_deterministic.hip -> fmha_ck_autogen_d18727988e47264b42b4153dc82fc1a750f08db0.hip -fmha_bwd_convert_dq_d32_bf16_b64x128_batch_o2.hip -> fmha_ck_autogen_ab6cd5c9242f8278c8f3d9ce57b97d605c7e5a3e.hip -fmha_bwd_convert_dq_d32_bf16_b64x128_batch_o2_deterministic.hip -> fmha_ck_autogen_0c93c65e5942a2f43f2e491547add02777dd2eee.hip -fmha_bwd_convert_dq_d32_bf16_b64x128_batch_o2_pd.hip -> fmha_ck_autogen_d32c64ef01aa228277d031a74df51363f98aa2b0.hip -fmha_bwd_convert_dq_d32_bf16_b64x128_batch_o2_pd_deterministic.hip -> fmha_ck_autogen_e5c5079636a4a31a849ce8a5af89d50330a74628.hip -fmha_bwd_convert_dq_d32_bf16_b64x128_batch_o2_ps.hip -> fmha_ck_autogen_ea62567e9ea16771d8445464c38f5a2931cb355a.hip -fmha_bwd_convert_dq_d32_bf16_b64x128_batch_o2_ps_deterministic.hip -> fmha_ck_autogen_c6e2da8b791d31f4ba05ef5f833fd6dea9e35f1c.hip -fmha_bwd_convert_dq_d32_bf16_b64x128_batch_o2_psd.hip -> fmha_ck_autogen_f731289837f915e2aec1bd01eef1b3c1b099864d.hip -fmha_bwd_convert_dq_d32_bf16_b64x128_batch_o2_psd_deterministic.hip -> fmha_ck_autogen_233132e712eba8972ba444c604f89e01c5b84cc0.hip -fmha_bwd_convert_dq_d32_bf16_b64x128_group_o2_ps.hip -> fmha_ck_autogen_afc4b47a6fa62a4ca5cff6a7e01c9f6b371d2215.hip -fmha_bwd_convert_dq_d32_bf16_b64x128_group_o2_ps_deterministic.hip -> fmha_ck_autogen_bec30e7107c5dce3fe6aa87d83ed96da75478da0.hip -fmha_bwd_convert_dq_d32_bf16_b64x128_group_o2_psd.hip -> fmha_ck_autogen_f4658c32d562f9d60c5ca1262a2e0df2375063bb.hip -fmha_bwd_convert_dq_d32_bf16_b64x128_group_o2_psd_deterministic.hip -> fmha_ck_autogen_9545f95c1093c60f0fb6c794636f79aaeb53b733.hip -fmha_bwd_convert_dq_d32_fp16_b64x128_batch_o2.hip -> fmha_ck_autogen_e6b53fb8d81148ff384d31a703bb4c2e7a5a33af.hip -fmha_bwd_convert_dq_d32_fp16_b64x128_batch_o2_deterministic.hip -> fmha_ck_autogen_7aa14aa94d625b33df1adfa30ef4d91769592608.hip -fmha_bwd_convert_dq_d32_fp16_b64x128_batch_o2_pd.hip -> fmha_ck_autogen_b5db3d5b1d8af89381fc4b8073f84c5fa25fdef5.hip -fmha_bwd_convert_dq_d32_fp16_b64x128_batch_o2_pd_deterministic.hip -> fmha_ck_autogen_e8a9427f34bbf5ddb28a39161acc36806e68f2d0.hip -fmha_bwd_convert_dq_d32_fp16_b64x128_batch_o2_ps.hip -> fmha_ck_autogen_724d1d4408196d611b2e0535bf8833652acbd6ef.hip -fmha_bwd_convert_dq_d32_fp16_b64x128_batch_o2_ps_deterministic.hip -> fmha_ck_autogen_a3ac4f93722dc314086f1b7d7b8adc687cd75f82.hip -fmha_bwd_convert_dq_d32_fp16_b64x128_batch_o2_psd.hip -> fmha_ck_autogen_377b70f54cb2778b5ce3df936b477f775eea8b3c.hip -fmha_bwd_convert_dq_d32_fp16_b64x128_batch_o2_psd_deterministic.hip -> fmha_ck_autogen_5f20263fd84776f155519b3481be5e2c5b035585.hip -fmha_bwd_convert_dq_d32_fp16_b64x128_group_o2_ps.hip -> fmha_ck_autogen_9745b04a8026a01828c5dd606d89d044d3ed1d99.hip -fmha_bwd_convert_dq_d32_fp16_b64x128_group_o2_ps_deterministic.hip -> fmha_ck_autogen_a7784b03ad757d51c234fa86ea9891f055ecd5c1.hip -fmha_bwd_convert_dq_d32_fp16_b64x128_group_o2_psd.hip 
-> fmha_ck_autogen_22105635385fbfb5d2f330df83ba6747bcb27f6d.hip -fmha_bwd_convert_dq_d32_fp16_b64x128_group_o2_psd_deterministic.hip -> fmha_ck_autogen_3afbb5ac9048a962a60f48886728220ae6c2aeaf.hip -fmha_bwd_convert_dq_d64_bf16_b64x128_batch_o2.hip -> fmha_ck_autogen_429b82a27571ac91e3631cbdb7e0a58155abf962.hip -fmha_bwd_convert_dq_d64_bf16_b64x128_batch_o2_deterministic.hip -> fmha_ck_autogen_dc818f3ce244743cb1dbff9aca399df90742a6d0.hip -fmha_bwd_convert_dq_d64_bf16_b64x128_batch_o2_pd.hip -> fmha_ck_autogen_7f9403cb91d6aabebf081afae94a8ba397d8d24f.hip -fmha_bwd_convert_dq_d64_bf16_b64x128_batch_o2_pd_deterministic.hip -> fmha_ck_autogen_ca5681d4e5871aacef74bdba9e368445875252d3.hip -fmha_bwd_convert_dq_d64_bf16_b64x128_batch_o2_ps.hip -> fmha_ck_autogen_1e7d7888480b83c78833214b32e10f37a6e20301.hip -fmha_bwd_convert_dq_d64_bf16_b64x128_batch_o2_ps_deterministic.hip -> fmha_ck_autogen_4018f690b6322588041bb467beabd8a7bc79a2e0.hip -fmha_bwd_convert_dq_d64_bf16_b64x128_batch_o2_psd.hip -> fmha_ck_autogen_23047ea90076e3b0a3eb0586d49b9ee74ca6d279.hip -fmha_bwd_convert_dq_d64_bf16_b64x128_batch_o2_psd_deterministic.hip -> fmha_ck_autogen_5a216f777feec4752f5882677b18168225da4b53.hip -fmha_bwd_convert_dq_d64_bf16_b64x128_group_o2_ps.hip -> fmha_ck_autogen_fd19d7614f2ed5da21a52ed172ef62cc07c9c01a.hip -fmha_bwd_convert_dq_d64_bf16_b64x128_group_o2_ps_deterministic.hip -> fmha_ck_autogen_9893336a4b00b2a63f23ed7e13ec54c82d9e5063.hip -fmha_bwd_convert_dq_d64_bf16_b64x128_group_o2_psd.hip -> fmha_ck_autogen_131c1fdc4206bb952b2fea675f24e3b09f605eef.hip -fmha_bwd_convert_dq_d64_bf16_b64x128_group_o2_psd_deterministic.hip -> fmha_ck_autogen_cc4ac5a18f57f2ebb65f7e356e858ab0d59b2133.hip -fmha_bwd_convert_dq_d64_fp16_b64x128_batch_o2.hip -> fmha_ck_autogen_dde93ffe7fca311e136e42fbcd12b05c9fc7174c.hip -fmha_bwd_convert_dq_d64_fp16_b64x128_batch_o2_deterministic.hip -> fmha_ck_autogen_7b67045d438a7e4b8f3a313a5df5a85f351c1be5.hip -fmha_bwd_convert_dq_d64_fp16_b64x128_batch_o2_pd.hip -> fmha_ck_autogen_9689ecd7bf51bcffe9f5002959bdda41c50a3c8b.hip -fmha_bwd_convert_dq_d64_fp16_b64x128_batch_o2_pd_deterministic.hip -> fmha_ck_autogen_c41b6eda4f250da059fe0c428428219ff5a250ef.hip -fmha_bwd_convert_dq_d64_fp16_b64x128_batch_o2_ps.hip -> fmha_ck_autogen_c45a5e40f6a66bc5292a56e0097c69fe37cedfb3.hip -fmha_bwd_convert_dq_d64_fp16_b64x128_batch_o2_ps_deterministic.hip -> fmha_ck_autogen_ffc6056d9fe125a4dbe08c1d86354e51f7daadd5.hip -fmha_bwd_convert_dq_d64_fp16_b64x128_batch_o2_psd.hip -> fmha_ck_autogen_2995d39cd62f20622a31f11a292ed175abb5fdf9.hip -fmha_bwd_convert_dq_d64_fp16_b64x128_batch_o2_psd_deterministic.hip -> fmha_ck_autogen_cb10303a0b79f2710eb7c66896d3c1f8b12c04dd.hip -fmha_bwd_convert_dq_d64_fp16_b64x128_group_o2_ps.hip -> fmha_ck_autogen_81dd3ea61bb61de02667b14f5a94198f48c7307b.hip -fmha_bwd_convert_dq_d64_fp16_b64x128_group_o2_ps_deterministic.hip -> fmha_ck_autogen_d3af8763f289dace1054bdcb4dfeda28b0aefcae.hip -fmha_bwd_convert_dq_d64_fp16_b64x128_group_o2_psd.hip -> fmha_ck_autogen_e6e6b10e73733716e71ebf5a53703fb935fc5e02.hip -fmha_bwd_convert_dq_d64_fp16_b64x128_group_o2_psd_deterministic.hip -> fmha_ck_autogen_e75c757c67aa23cb88e1aced6fcf36b7b28391db.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp.hip -> fmha_ck_autogen_2b3af90387f1d227119c5dcd4b71362940bbce52.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi.hip -> 
fmha_ck_autogen_e3015c5d50481547aa5754d042d9d7040cf1c7ff.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_deterministic.hip -> fmha_ck_autogen_a4700d87a19a173e84d64e43cffabbed52366e35.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_dropout_wg16.hip -> fmha_ck_autogen_6af4c15a119e805e4407b184625f57966f8833d9.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_8b17c082f249649eca733a8f0cdf9a1205c3e3d7.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask.hip -> fmha_ck_autogen_226662cf1c9900a4334d2cadcc5f5ac3ad355f05.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_deterministic.hip -> fmha_ck_autogen_d723b191785c97d284675f700a7baeb52a2eb791.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_afdab954fd111ec48721f25710d61c0c8affd8db.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_d54ac01458df3f240e0656d82330f9de23ba9651.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_deterministic.hip -> fmha_ck_autogen_4ed6da5357b67cc28aee4afa9523adaf055c4e32.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_dropout_wg16.hip -> fmha_ck_autogen_8c3bd4e029bba76ebfc79e6522dbc8ca0bba5dd2.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_dropout_wg16_deterministic.hip -> fmha_ck_autogen_dbde2ef18e2174ebe13a6e7c8c2a6b05a6612047.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask.hip -> fmha_ck_autogen_c363ee1b087f6b504a3dd3972b96e77db02b0582.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_deterministic.hip -> fmha_ck_autogen_a02a71fdd587e47ee68e0cc76c3c4494ce06c359.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_dropout_wg16.hip -> fmha_ck_autogen_968fc75a7d102aca068e3ceb6111728c280fa837.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_4a06b5b153ea6e8b1e20d9aad9d4633333fd98f5.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps.hip -> fmha_ck_autogen_bde24a8dbe6add6f2dd2beb48b1280f3a84a9b2a.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi.hip -> fmha_ck_autogen_415b183c50dd2663dabe3eb8b780913b778c54ab.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_deterministic.hip -> 
fmha_ck_autogen_258d747083272ea657604ac84867ecea17bd65da.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_dropout_wg16.hip -> fmha_ck_autogen_2a97c457144cb63a9c6c3d6be613b47bd0df9928.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_8d7549e66ef309e32779ddc2a1f14e79bae53754.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask.hip -> fmha_ck_autogen_5cd41b6f578f3c903eb9d58ebfab62eb296044e0.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_deterministic.hip -> fmha_ck_autogen_dc34b6ef496d4e0d8fbbe10731d4a7b1c136c036.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_4a9f3da698a6103caf25d785928dd9f814ac27b4.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_aa996b9c843200a2ec33ed4319b48106cd7c6384.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_deterministic.hip -> fmha_ck_autogen_1d02609fb803ea2697e2c2cef35e6f923d2578cf.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_dropout_wg16.hip -> fmha_ck_autogen_58eb2edc7738d8d18ac359691da261ceaaf71788.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_dropout_wg16_deterministic.hip -> fmha_ck_autogen_c35ea54eb6cd0f3756c462c66d9be956279b46ad.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask.hip -> fmha_ck_autogen_0f0c699d9c3b0ed62097e38ba05e40e815cf474e.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_deterministic.hip -> fmha_ck_autogen_64fe2db75cb20428856b02cd1cc8d7b393a6ad9c.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_dropout_wg16.hip -> fmha_ck_autogen_38b94d76503e13c911781169fbc378517332c42e.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_e0966fa1ff013e477b1706928de6cb7f8587c154.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk.hip -> fmha_ck_autogen_b9559dd36a0a4f5e068a722e285f485137bd5ef0.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi.hip -> fmha_ck_autogen_5a05b4e7782bd0e29ca9f6d33fc59d4304136d41.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_deterministic.hip -> fmha_ck_autogen_b9385db12001110c42eff6aabad935a69ad3afe2.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_dropout_wg16.hip -> 
fmha_ck_autogen_c1f721a330b2d0fac13b22061616d7b10c0f91e9.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_37fe04467e87ec2110f60c7aea0cc9bf2ca07481.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask.hip -> fmha_ck_autogen_d4b99af9a573df50a27fccbec3fa8e350f1854eb.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_deterministic.hip -> fmha_ck_autogen_20588bcac681a5d69f252d7523a3681a0c6b6181.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_a3709e4fc53d2254a03ea7660b8c72d2f47cf1ad.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_47fe73f04cef91cd2a0682e905483968ff80eadb.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_deterministic.hip -> fmha_ck_autogen_ad9b99a194b59d3149842c15733394da275b12c0.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_dropout_wg16.hip -> fmha_ck_autogen_062c8c3c1cf6c33af4574099e9b6ac54a55ad776.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_ab1ca4ce061f7f69a250356f613cab00d1e2ac71.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask.hip -> fmha_ck_autogen_cd4efcdd12184211c74e7b3f2f30fecf1041ca32.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_deterministic.hip -> fmha_ck_autogen_1d0b822743e0205f60521d38d7c64f589fdf0f58.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_dropout_wg16.hip -> fmha_ck_autogen_e10f47a44400de385ddbeb99475b717c5646fb41.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_3e562e6c3af28b8478020ce3c3bf73c036001c93.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk.hip -> fmha_ck_autogen_1a99b2625adffa8215276bb88fc65bae944b846b.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi.hip -> fmha_ck_autogen_56cc4399c5567a9495f17d54c712cc9e65e57521.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_deterministic.hip -> fmha_ck_autogen_ba8b09f0aaa40a7c9ad5f0458b460d3e328f3c74.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16.hip -> fmha_ck_autogen_43e7c78e8f65be35e2753a0ad5123118555c56b2.hip 
-fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_bed5a8c5cf683f6dfaefad72c2e2f5c2f2b2732f.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask.hip -> fmha_ck_autogen_73ec21ed6e040260c4f04ef68ef9307aa86985a7.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_deterministic.hip -> fmha_ck_autogen_3642b78913a853a62dbff8b99d9ae3fa458f461d.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_ac5e9aee85cd16903bf7b82a4ac10402b0b26e22.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_5f954a393b7b5a7131c13d0c4578443f468a738d.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_deterministic.hip -> fmha_ck_autogen_78e945db4afa1330fe3978bc1bc9ae99828ae287.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16.hip -> fmha_ck_autogen_d4aff499ad527be5fe33b8e92547df57af26d40d.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_2b8169ce4b4b9a17ac96fbb232e6a93f22071ab4.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask.hip -> fmha_ck_autogen_0a89417a043556970f72eebd48b4f3e7ac15377a.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_deterministic.hip -> fmha_ck_autogen_4824e1f8cda50f80988857611da766685da94494.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16.hip -> fmha_ck_autogen_dbae1670fac6812b2d2cbad973e4b475509ea504.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_5daedab8931f2eefb649b91e80145cb71b63360c.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv.hip -> fmha_ck_autogen_dfcd68acfca68d1acac94f493e25be0ef20f209f.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi.hip -> fmha_ck_autogen_3511c54e6a6f9eec378d8b661121066536195d3a.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_deterministic.hip -> fmha_ck_autogen_deb9ec2cccab94920e40f62a1f0f094acd919d07.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_0fbb0bef3b388867e75d7a8a187b8b4b650a42ae.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_dropout_wg16_deterministic.hip -> 
fmha_ck_autogen_ab0be5a2072b5e87f5ee58149688796b6513219f.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask.hip -> fmha_ck_autogen_91a6200e36944b1f11106c02f7fcee053f01ee71.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_1f81f8cce0d77dec9f977b9eeb0778b70a13fa75.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_bcd7ccdceb7baf3b986f2a0248827822a5f72e47.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_58762476c7f2bb05dce92ec22c0acbeb03676746.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_deterministic.hip -> fmha_ck_autogen_f4df1cbfbaf67705820f125b474469ad7ebab0c0.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_dropout_wg16.hip -> fmha_ck_autogen_f42cf0e5fe479690883507028748b0cd3dc83cbb.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_f682399cd6412fed6a1141296a7e4d42078f7b29.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask.hip -> fmha_ck_autogen_256ef175029a43e64164176d4eb212baf9d27bb9.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_deterministic.hip -> fmha_ck_autogen_3206cc121ce8955ed59ea3b12b858ee2e0cf82f8.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_dropout_wg16.hip -> fmha_ck_autogen_d1840494c4fa78ff399c0399b3ad7ca3d22d4587.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_31c4b866692ba5c3d115482bef4790733863c1fc.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv.hip -> fmha_ck_autogen_b5c7fca1f76a31b0390e92d90d569fab94d4f783.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi.hip -> fmha_ck_autogen_dc3d625c5ad3e871f5a727ac946df642d988b9ab.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_deterministic.hip -> fmha_ck_autogen_ca4c6ad28aff1976c6dd36974ec3b339aa3090e9.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_eac353f963c52624cf79e82cc2b2c02eed94b677.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_29bffc159b0bb826ba489ae763dae141bfe8e802.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask.hip -> fmha_ck_autogen_9b327f0fa1155f2235d76be45cd22e3db5a69429.hip 
-fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_d0dd0165ee91c095a19ceddf08789e3576912590.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_5344427df3ae9392c4fc4c25c232196828e70648.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_3f7315955f555768f24585a50d75e216c40f062d.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_deterministic.hip -> fmha_ck_autogen_dbcea8f7b5930abf76eecefce92d0db785d2df5d.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_dropout_wg16.hip -> fmha_ck_autogen_165dfb45658df8f1ae8dc0738ac9614740f2576c.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_8a58d4bca33c4c0e79141a56688049237d170d1b.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask.hip -> fmha_ck_autogen_fe9d98dbec5096a89b116f85675af772f023014a.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_deterministic.hip -> fmha_ck_autogen_d1c0dfd19a08d61586758091370acbdc6f267017.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_dropout_wg16.hip -> fmha_ck_autogen_960ecb3013071fb65f2d5ed4c947c4bf303e5308.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_1552dc38d26f6badb7a9bcb5ce9124d54cc45ed3.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv.hip -> fmha_ck_autogen_3af86f458fb4dfcceb7db3357fbae0dc15142a15.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi.hip -> fmha_ck_autogen_74ba59d347ce8916a22b40e6f22a3c89e13db4d0.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_deterministic.hip -> fmha_ck_autogen_7344f96bed2f56793b1c2583485aa161cdf30379.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_ad989d2ce769f20e175fa88f4082c1c25fe03062.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_096e888c52d0f4a5847d7515fcc66208b1ff40d3.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask.hip -> fmha_ck_autogen_7cbe4562c51d6829ec5942e11035c452fe318b3a.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_621da34ee666903307d3a09b7a032f2a70054759.hip 
-fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_c64f4cdce32189065362a502105c31bd2d9d99a4.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_987f00dd759d9714693e7517dfaa8bb427294d42.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_deterministic.hip -> fmha_ck_autogen_1c2a2d78176e3f0a78e3ad78217e75a4430c0de5.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_dropout_wg16.hip -> fmha_ck_autogen_ba145535e53899fe127987aa854f81234a9c51c4.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_0968cebd81ade762c2f92fffc0153fa7a2b91eb5.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask.hip -> fmha_ck_autogen_b41735d250b5a16967281a5f07873b9cde3df4d6.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_deterministic.hip -> fmha_ck_autogen_fac5a0f98b94530befd634891e42c424bb86f0e1.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_ffb8adef0cef91a86f36872407fea35df90e8f2b.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_78e1edca5abe1bb3e7aa946eab6484b7bed806a3.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv.hip -> fmha_ck_autogen_88ed7f650c958a644c8031aeb88688b1e42458e5.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi.hip -> fmha_ck_autogen_ef2ebb4a86e7ed0001de9c5e607b66fe8877409f.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_deterministic.hip -> fmha_ck_autogen_f3ff73f82aee3184849d04c2364eaa45c6d0de9c.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_fb9477a613665cebcad781389ba7c5a36f51efe2.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_21f860d42fdc2cc6bd743d53ba546e332c22fedf.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_55ea83a47c6299fefa4220ed88f7a8e1dd938215.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_80987e2d765efc320eaee813607c94c80ee35aa4.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_288458c5a0720ef152848713119ebce6d76db6d6.hip 
-fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_d6149eea92f2c40c11de3b778102fcf9b6a006b8.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_deterministic.hip -> fmha_ck_autogen_7b5680f97836be4a369802e8115617a83875703e.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16.hip -> fmha_ck_autogen_4347e039c003489dd528faf5d710e687321a3fd7.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_e5b2bb9f8466de1ad5210e4c39ee7b8ecacdffa9.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask.hip -> fmha_ck_autogen_bc6ce17223d8d83a64b8c96ac88223e4441a4692.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_deterministic.hip -> fmha_ck_autogen_fc1790325b59bd44b0a5f6cf9723a25fd845cba7.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_662767e588220d0dc6137b00cc1d8dcc91e97134.hip -fmha_bwd_d128_bf16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_a3dc780b17152f696f9b957432c2eae8fb16e85e.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk.hip -> fmha_ck_autogen_9a8e04fe9432a60f86ff0369e8c1851821074a04.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi.hip -> fmha_ck_autogen_835a906031a258c6362313eec783678bd8125c91.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_deterministic.hip -> fmha_ck_autogen_ee8e709eec7aef1fa681053c6d2969a5ff18c45c.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16.hip -> fmha_ck_autogen_8d079c1eb36db8461fa8b861c56760afcd97cc34.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_64b3488ddf3bb1a4870371882f0a5d267bdfdf73.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask.hip -> fmha_ck_autogen_ca3975efd767ddf7c12e308d948bdcaf0968493a.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_deterministic.hip -> fmha_ck_autogen_82ad0c0580516485ea432d98f53e73f6dfec548c.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_4306c6c37cf472ad262f53941611b5e60072bdf6.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16_deterministic.hip -> 
fmha_ck_autogen_4904c5910a2d0595b39a3f87652a9d1ef4fcbe80.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_deterministic.hip -> fmha_ck_autogen_f57f84892e2a8496169b7406e63b0d4f5aa63aaf.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16.hip -> fmha_ck_autogen_f24f26e45d5cf567d29fbe375fbf8abdec39186f.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_a85d35b2fd98742427930eb536e346ffb005edd8.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask.hip -> fmha_ck_autogen_19df4e13108e043361e9528b71df56f04f696a0c.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_deterministic.hip -> fmha_ck_autogen_dbb06b43d5d65429e23cc717448cf1fffb0cfd74.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16.hip -> fmha_ck_autogen_0ef9b9413697d6f4573c6605bff6f58d027c5016.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_0b2efefea81036641561bed80c75d77651176f74.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv.hip -> fmha_ck_autogen_27c2000d32c230a57a6712f27bc0fba02722f5fd.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi.hip -> fmha_ck_autogen_ab1d7f93427095e39bfc1d986b3d7fe54073ec75.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_deterministic.hip -> fmha_ck_autogen_7dfe21ee27f8a0ca0407ef0dea73cd73ae6940db.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_8007bf7ae1b71bf8ac4a793aa519ad333aa7a7ba.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_3937d9dfb68351de2942e32f35e2ca1ce71edfa8.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_de1ff66d2aeb47d2fdccaa4bb6b9d066b380c99e.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_5403eec1cdd216d5c4a7ba977e2ef92a0d7fcc8b.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_358399e756ed5026baf3ab78af17489dc07b9532.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_bd064e302ff5b983dbdb4ccf51383fb29ddff44f.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_deterministic.hip -> 
fmha_ck_autogen_c11d68fe766fc753c657362673704005b538660b.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16.hip -> fmha_ck_autogen_fbea85b766bf0c918ee0baf24dffc6a5563d5105.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_92f9ad0fb65638cfffb3e7786f2cbf01d9585b23.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask.hip -> fmha_ck_autogen_0a55ed15ef58c941e06dda890aeb530e28eb7bba.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_deterministic.hip -> fmha_ck_autogen_df4bb75ca79f805a81fbad750ad22f6d22b0d8ff.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_7ab03a62e064864e1e9c1cd506c1b2e1786a777c.hip -fmha_bwd_d128_bf16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_a189292c81a18d21a2921ce6740f81ebf4c046ad.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp.hip -> fmha_ck_autogen_c9312d7159369d13f3148a6f0882dfad6921ceec.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi.hip -> fmha_ck_autogen_0cdef49859c80c6b3ba18eb2fb4c35c72abc1cf2.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_deterministic.hip -> fmha_ck_autogen_ae87b1d5c50606430b544ed650d87df24366e7d5.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_dropout_wg16.hip -> fmha_ck_autogen_0a92671b6ea99891c0d69b1c793f4d131b9a82ed.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_f4a6438394dd3427f29aa0bbe58ad1f797c3c38d.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask.hip -> fmha_ck_autogen_fa85f869a92f0482605e52019828244b12e12b44.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_deterministic.hip -> fmha_ck_autogen_c2541b6b5cf27de3f45f60671d36602f07ce1783.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_0595316f0dfffda03e5296b959a49ec3f3c48d67.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_fff7aa57cca501f221077124359a589b3a6f9d0a.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_deterministic.hip -> fmha_ck_autogen_358d28c958c0a831a615a4811d13279b18db09c4.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_dropout_wg16.hip -> 
fmha_ck_autogen_96f1bb85dff8c97846f6b2e8796a6289bcd0d9d3.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_dropout_wg16_deterministic.hip -> fmha_ck_autogen_14d4630876785655bd4950566e81ae0b645c0d3c.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask.hip -> fmha_ck_autogen_a48843d844f78690c7a45b730652f0f763c595c7.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_deterministic.hip -> fmha_ck_autogen_3e143d88eaa0d9cfea856b2f3a57d1275a656627.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_dropout_wg16.hip -> fmha_ck_autogen_18ed7195a9443c84956c3f32839cb3ab9056bdfc.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_f7035f4bfd8f2f427720a07e3c311bccc1dba683.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps.hip -> fmha_ck_autogen_f87790f260630f312b84888dcbdf849ce130ae59.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi.hip -> fmha_ck_autogen_fe97b7adcd67ed9bda8831d1f3f1ca7590c6d251.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_deterministic.hip -> fmha_ck_autogen_b41a30092e8138877c1f6c25656e0f8ae2c2444e.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_dropout_wg16.hip -> fmha_ck_autogen_af06c0dae15684f83e15722a4c07342af9ea011c.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_158d5ce564c3ae1eefb54e3d41dde2604560ef4a.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask.hip -> fmha_ck_autogen_49f5017cc0f5c8c8dc71492e7765cf729c1f225c.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_deterministic.hip -> fmha_ck_autogen_280bfced8745fbd9266207463fb41476dc23afff.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_eca613eaa8471ad7da66d2f8f2b8e07f6e02b467.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_8e1b48a28b71c7f4c78eb14321b39951a7c5e903.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_deterministic.hip -> fmha_ck_autogen_a1d6ad9de7ac7993ae1923a2ef070b7dacb8c563.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_dropout_wg16.hip -> fmha_ck_autogen_04641230fe9a50a221047f7a1df8a370f72805b9.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_dropout_wg16_deterministic.hip -> 
fmha_ck_autogen_bc1ae1dddb8cc5d78196da6b26ebe66c1ce7e567.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask.hip -> fmha_ck_autogen_e8d9b65558398c0c10127b560807578ef117d7ed.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_deterministic.hip -> fmha_ck_autogen_87e3a06266deda093bdf28af82d8666066157fc6.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_dropout_wg16.hip -> fmha_ck_autogen_0a672fca51de618e3441cf8764e8e83eb782f2c7.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_92d841e6d783bb46d841aafd9027f92dd1b61b88.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk.hip -> fmha_ck_autogen_01f74764c3c3284fdd1b67d0ea781c2261ed0de6.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi.hip -> fmha_ck_autogen_feb5e77111fe1e20bafdb83a925b5faeeb6214af.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_deterministic.hip -> fmha_ck_autogen_26d77b228420a3ead919474ec9c6fb2800f86890.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_dropout_wg16.hip -> fmha_ck_autogen_4fd34faa8b168e2ac7862641229e6146d3e28aee.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_5724d91c1fd6290a6cf8d52a3801ac6b921dc7d4.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask.hip -> fmha_ck_autogen_dd11806cd2d3ef1127f676b2d98bf8fff2a1e5ab.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_deterministic.hip -> fmha_ck_autogen_aceb0641213e9a45ba48bcf72bb23845720d8b79.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_c0338fbc05f86270ded7df2bd3e2758a03961b62.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_2e8b4260626beeac76c26dbcee3cba1457b30e99.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_deterministic.hip -> fmha_ck_autogen_4e0a88ccef04e81b8c684b695f7cb4310e448915.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_dropout_wg16.hip -> fmha_ck_autogen_6f31b3345893eec8ed1ddf1d8de2512b46ff6187.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_83d920a76114c63156740ba5dd6f3846c4b21c28.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask.hip -> 
fmha_ck_autogen_86fa51b8c7a2f3fac5cf4cd2951ed2ede5c35450.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_deterministic.hip -> fmha_ck_autogen_e7b2eb64b66d46359fab44333c2c484f4c9dd5de.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_dropout_wg16.hip -> fmha_ck_autogen_81acf1d17650712b71a499bb66909bfcfcb6aecb.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_f1ecc90ad7b86791a9e6f73a582aeff30f393804.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk.hip -> fmha_ck_autogen_f01468c62c878295443981662e037ec5213cf7a3.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi.hip -> fmha_ck_autogen_e2deafd2f36cee29109fb824e0135407453adcfe.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_deterministic.hip -> fmha_ck_autogen_b1766695dbb790bd614b83dc7569ad449404cc89.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16.hip -> fmha_ck_autogen_784c35fee4d372123631312f1051c43e1fa12378.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_38bb367362fe2c4849ded728ec5dd00969ce188f.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask.hip -> fmha_ck_autogen_9afe4b6f3b901ff4af81bd4f1cd8ff19f09d0b07.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_deterministic.hip -> fmha_ck_autogen_9ca3b1d36d777213eb381b47871bf15dd163c994.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_d5edfe3e3dc3008b928c8e6dbd50784b905f189e.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_10c24f1f9009e46afa3a59193784cc2575f79056.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_deterministic.hip -> fmha_ck_autogen_7dac5d4cf103d658e129673549549f1276f134e0.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16.hip -> fmha_ck_autogen_c8dbfaffc8a9b573f194f9c63f1175d9725f8950.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_566e26d4969bc6bbe9b092bedab11cddb3360c0f.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask.hip -> fmha_ck_autogen_0ef309b923172f4c0fb38d9b9f5325b33b4877c2.hip 
-fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_deterministic.hip -> fmha_ck_autogen_3bb3b682eab96e4e173affad75b9d8e73f1dd690.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16.hip -> fmha_ck_autogen_f92e9a82c879051d6fe3c42108f8a574187704af.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_4f44435491aa68acb3217b0e693232c67641a2db.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv.hip -> fmha_ck_autogen_6082d55544b5280b49b071ea277fb1827193fa2a.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi.hip -> fmha_ck_autogen_81bb8f13b6f20a72c9ce6d0b53f81eddbf05f1c6.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_deterministic.hip -> fmha_ck_autogen_1e42736d4f677a59a172bd6f162616a437696351.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_b9ed0a64deb55616646ea98b21a891c971cd98ad.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_fb2fbb135d59028afcf867c2cf08edc323565528.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask.hip -> fmha_ck_autogen_6360621af3f7e1e81a8be48fea8d2750fdecbbf4.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_097b3e1dae9bfb2e89398706508f8e01966fd4ea.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_4409f2a7deb027e864afdfc9975d3ab93c5dcc9a.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_6d307974bdeeef95cca0d130ebb7aeb77fb1b6eb.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_deterministic.hip -> fmha_ck_autogen_01ee0083f6df962c4a754cd3295b1a436c590a0e.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_dropout_wg16.hip -> fmha_ck_autogen_c0a3c4ac0a50bb9b7ad764929dbee98c856b1210.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_c27b3026f1dc3056dee3a3e64bf31c45683607c9.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask.hip -> fmha_ck_autogen_5af96b404feac271dac8f4190180754480d3ba80.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_deterministic.hip -> fmha_ck_autogen_f69878f4ca8cfe6b8d8748766f66a1ef8eab20ad.hip 
-fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_dropout_wg16.hip -> fmha_ck_autogen_8689126a7eb09d81baaf8f99dbff8932fbeab3cb.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_f6856ca950bcf173571766c3f04de4163be0402e.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv.hip -> fmha_ck_autogen_d036096f49a89730f8af7e75457c88cb8ae64165.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi.hip -> fmha_ck_autogen_03ff035717140f7385282419598cb4fb2881ce8e.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_deterministic.hip -> fmha_ck_autogen_de85901d66dc04b1143bb6404445baf65693b781.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_5c742b9ac6749f189d597ac97d46d35189472c50.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_bd9c47f3305e47db6ab6bc627fb3d80269633074.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask.hip -> fmha_ck_autogen_d82773721479613ad72e334510a248f1436b38d6.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_cfda56a4eb08b803332f25bda6209932d9624acc.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_328a311bafd1c153525393b252e4170f8aafb370.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_e5935fbda313d3518f142f43d46f56c600f69286.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_deterministic.hip -> fmha_ck_autogen_48e9e858abf6f77489f3fadc4ee81edacd26705a.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_dropout_wg16.hip -> fmha_ck_autogen_f71f96ce4dcc7f789a8ace73c230c203b05ff6dc.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_01d12033d59ce2799a2a024e5d9232325ccf1320.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask.hip -> fmha_ck_autogen_5854f09511778dd1779a839b0b194896070f69ad.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_deterministic.hip -> fmha_ck_autogen_7237ce5f3cf13ace3efc0b0227ae5a8c1fdfce1d.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_dropout_wg16.hip -> fmha_ck_autogen_3b4ecb47f9ebe8c2784976c3e9bbe4834b475cf1.hip 
-fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_f18c74becc24a93427d9c0838784e9b6caad6e81.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv.hip -> fmha_ck_autogen_c4c6c405cefe204824e8fad1b3dd34bba87e796a.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi.hip -> fmha_ck_autogen_41db3f29d1940e59dadc357c040ea37a6ff208d9.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_deterministic.hip -> fmha_ck_autogen_df4c9eb48da49a61957537270d94e56cb4e426be.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_6018ab272d7306689c7dc5a6d5326efea1471235.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_a421c2ed6b295c458071f1988b9d6f7b46e8992c.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask.hip -> fmha_ck_autogen_61a44ac409e914c12281f1d26e5b52d8bfd0df75.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_7e332a6aeecfb12dcf70c69157fd3137343fb9f6.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_2e43e401abbfb1b6737e4dc822f68421abbc648a.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_4afd02981f92fbef6277c1985cc479c12bae9239.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_deterministic.hip -> fmha_ck_autogen_8513d96a66a4d9fb8dfc84afba7e1d8c200248a6.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_dropout_wg16.hip -> fmha_ck_autogen_c4dec99707511cebd9188d216ee0a148d729b470.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_b75843bb13058ffe29251e053800c509c7590544.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask.hip -> fmha_ck_autogen_6eca9cd905ea8b0454cf9564643894682b08cb97.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_deterministic.hip -> fmha_ck_autogen_c4b34d3cb673447773f6da23e9cf52b98e99f718.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_fbeec221cd63adaedceec39db41ea942f99f5133.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_2d7b637e0313cb423b22cd8844cc2997b3ff73e4.hip 
-fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv.hip -> fmha_ck_autogen_8fb224b40a7be7db0a9c5c08cc5ab05b526c14e8.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi.hip -> fmha_ck_autogen_e28fd64c2f2b27577109a984e6ab82f5f0fcb296.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_deterministic.hip -> fmha_ck_autogen_2eba937ff6d0302ab013db7349d4feb914107f1f.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_4e79dce18e49ffe024fe4cd0693ad3399f5edaee.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_8cdcdeb845e7bcdb89ef70ab2a97157d4db3cb52.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_30024440e780fdf9ec94deccc85216d8bbb5788a.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_c1f40c3421b9ad8cf43940530ec50bcf620058f2.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_7e89f79217037e361bb0909d06534e40f5026b4f.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_44564dddf8b492d80be54854abb8d1d831e42679.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_deterministic.hip -> fmha_ck_autogen_7831ce329f2a0812ebb1dd103ea4ba8cb7ba531d.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16.hip -> fmha_ck_autogen_b7a03ab0b7887cc7ed0cb40e56360a8d36c0bb8e.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_e986d5f8d5591f3e0f1cdfad19c38c420fd93023.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask.hip -> fmha_ck_autogen_076b3beb57b30afb30636f948e3989b346b38d20.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_deterministic.hip -> fmha_ck_autogen_2177d95cdf45f6fec95d1812f2ef183a75259e38.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_ff6862dbdbb20bc63a650e1f93e9ac169bb702b2.hip -fmha_bwd_d128_fp16_batch_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_649336d59a8b35919e593217b6fd4314a04ea359.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk.hip -> fmha_ck_autogen_14d11aad7b666f500f68b264a2fcca6dfc5f1a05.hip 
-fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi.hip -> fmha_ck_autogen_4d5f3cf0f78f73df79665c26b20b0805615e1b04.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_deterministic.hip -> fmha_ck_autogen_4bc48576f285325345fa1205e5e7e01787b74f71.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16.hip -> fmha_ck_autogen_e7c0a99e949baa5f3a7ee2d6e84427982f82f76d.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_4a2e6b05e7e4de2cb23d815f8b2c8adf22131c0c.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask.hip -> fmha_ck_autogen_0842c4e3aabdf55405b3ce09ce1899245ddf11ad.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_deterministic.hip -> fmha_ck_autogen_9ad1f99284aafc8d7908d062f179a056eb314925.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_b80d0828ba6d24ea3c1a97bd9835ee937b4b32fb.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_847feaf237911478173377a501ee19ee325b012b.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_deterministic.hip -> fmha_ck_autogen_1a8da3e6ab050262b659c801ccf9a14787d7f176.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16.hip -> fmha_ck_autogen_0225857454eaab2eb664aef7a0849ce12c32fdf9.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_80a72d70d80b66c19e85daa00497308381050048.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask.hip -> fmha_ck_autogen_8b9043572cabb65435627a3faf23b18d039bbcd8.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_deterministic.hip -> fmha_ck_autogen_378759ae25465c32960487375828e23c5f1ac869.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16.hip -> fmha_ck_autogen_83ddca2c6ecbba4314c434e7471ffb8fa642f936.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_40db688a9189e1c47c300d474df946a248a63303.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv.hip -> fmha_ck_autogen_ad091c69d19b27f7ad50ef6311532ad8b642a9c6.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi.hip -> fmha_ck_autogen_5e735b12d130ebf849ac5d6752e413ecf3e69fbf.hip 
-fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_deterministic.hip -> fmha_ck_autogen_2c77bd7e89ed832cc31b2995566a49bec6e4cb52.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_133c51948cf8584900807998da14d788039f53b9.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_c29110dd501853e87ebc122dd1971b0bb1bcd92f.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_9a9edbe35a8fac7796f00bde836bd547044770ea.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_ccac6c0e61b65c9422c7f30fbd979031698370a9.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_0d13a4c8d169877da6408584dc1f20a6f7c5e3aa.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_744ec604c577a27e0aae5b39711a9e2eb82801b6.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_deterministic.hip -> fmha_ck_autogen_43f2156a04b18bab55af60e9357f28d8a4604e8e.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16.hip -> fmha_ck_autogen_dc9e54273c0ea2358fb573a7d918aa7b09fe07f9.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_4f0aded9d1baec3125ce8e176248cb146ca580fa.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask.hip -> fmha_ck_autogen_c80dce1a17d073259250ec0c87ade69e639ffa8e.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_deterministic.hip -> fmha_ck_autogen_e307a1b0d5a8f94e0a0f4032f401d20b4b643523.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_05538339c21c92c53d237865d72debaaf2ee5075.hip -fmha_bwd_d128_fp16_group_b16x128x128x16x128x16x32x128x128_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_ec3deb1382003ac010d9bc1c59d1878d3ec7a727.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp.hip -> fmha_ck_autogen_1f7faa0b33a9aada86f032174afd40d18efa7715.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi.hip -> fmha_ck_autogen_4462b192a64efb60d5484798526278ac7a0fb9fa.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_deterministic.hip -> fmha_ck_autogen_3a2643099365d0903c799585f41dc1a525ac9f9e.hip 
-fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_dropout_wg16.hip -> fmha_ck_autogen_555ba79201a585bc091ccfc326fd24e851d1eecc.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_153e897098539c3466da9d7a37234daf16476277.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask.hip -> fmha_ck_autogen_38a5ff72f22e0ad040a281e66b1aca0bf3a2aadb.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_deterministic.hip -> fmha_ck_autogen_4b2e7f96b095ebfb66ecc7a75752fba2a63e4f37.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_0fd4068ea93fcf4df463e3bf3a6898d23b65da7f.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_2b823c3b99e7c8d1cdc39a5dbc7365a383bf9ccb.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_deterministic.hip -> fmha_ck_autogen_3824e97d5ecba46e06d5ec1a9456c810d80227a3.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_dropout_wg16.hip -> fmha_ck_autogen_a5d4eb673bafd81e3a0ee213da4603d88b8460ec.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_dropout_wg16_deterministic.hip -> fmha_ck_autogen_40aa64439b80ff8dd12498b3e5f6b625da16e285.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask.hip -> fmha_ck_autogen_f3bf7ef503bb026258b3ec3d82d3ef1443046964.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_deterministic.hip -> fmha_ck_autogen_556cd05288e1666f5c67fb87ad02ce660e4c589c.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_dropout_wg16.hip -> fmha_ck_autogen_fc030b61ae20c4b7d9b2d10930a17e01e9e93328.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_f069b38b26c30bc770f74c856e47eb498f5818e7.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps.hip -> fmha_ck_autogen_fc7b0916744b593435d8e1e7b6d874d760cd5e3b.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi.hip -> fmha_ck_autogen_abf92a5314fd33491b5eb6ebd2418b7e0d5db774.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_deterministic.hip -> fmha_ck_autogen_d41b6a64dd181f2efa65aaed03a3d229b3566c1d.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_dropout_wg16.hip -> fmha_ck_autogen_80bfb0e6032892cc58cef4dd403f305a5b76851b.hip 
-fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_8301bfc0394936a68fa0098580f06e77c88ebed9.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask.hip -> fmha_ck_autogen_e9b53fa68641f45baabf40b7cfb8b35a9a1b9c7f.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_deterministic.hip -> fmha_ck_autogen_c9fb8343e623e46f01893a2b61345d1ca5928671.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_320a6196b662a1d3dc7441a9536d825dc356b95d.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_a3d7aa46528ee74e2bef1e87c1feceacfa55e173.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_deterministic.hip -> fmha_ck_autogen_c59937be2b9a13d6520fdcc922e4e75c9fa085ab.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_dropout_wg16.hip -> fmha_ck_autogen_e477abef05ff37ec27705eda51896e2aa3a04966.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_dropout_wg16_deterministic.hip -> fmha_ck_autogen_3da8c31f6d5bcaacfa4a21aed4d1d3caecb48922.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask.hip -> fmha_ck_autogen_6d40d762ed576832b3a752453e9881b5fe6d2650.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_deterministic.hip -> fmha_ck_autogen_3c1454ffc1418dac641f63671e947d9f550b1f0c.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_dropout_wg16.hip -> fmha_ck_autogen_96c129dd4c798343d6f78ab78056f0faf2f1c9d3.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_242013527a0266ad479715ee3e6ae01c45de29d0.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk.hip -> fmha_ck_autogen_2dfac5a83def98340c8786d55a30a98ad68b9eed.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi.hip -> fmha_ck_autogen_ae51b30c7e1cd30e550187458350c8db7c59a9ef.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_deterministic.hip -> fmha_ck_autogen_5e0abf4e2b6be3e2c555c2134705b9dcaee617ce.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_dropout_wg16.hip -> fmha_ck_autogen_7309c38fc8a2d5ad6efd449107dc54a7509624fe.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_245d90000b55ab8b6055b1934880fc6c4870b34b.hip 
-fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask.hip -> fmha_ck_autogen_0b9585ba1c10acf67115c5899b3546608541820d.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_deterministic.hip -> fmha_ck_autogen_8e431313fe082958d31b68d2fd0d61df0fe56736.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_1db03461737f1e359f389a8d297476f9b60faabd.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_b4b037a2e262d11d3ed7d9feeb41b9e05427a739.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_deterministic.hip -> fmha_ck_autogen_c919b8ed877d4244d01a17ecb948b459e361ff24.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_dropout_wg16.hip -> fmha_ck_autogen_64cf03c0aa3f1b2a7b76b4e3418eb5063b982a29.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_1386cd75411e61a8dbbaf2b916e62f4f5f99104f.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask.hip -> fmha_ck_autogen_6e8cda718e10824956f0ee39bbb0891eafa45a7b.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_deterministic.hip -> fmha_ck_autogen_2ea394a09c8691a534ad2219bedf73724b6dd5ce.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_dropout_wg16.hip -> fmha_ck_autogen_748a3d76e8ab73af9a5d2302d33e3b1d1b866dd1.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_e907e8d1089557dfcc95a05160be5092e9119a53.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk.hip -> fmha_ck_autogen_c4c3425fe683d35dc3335db77d183ad1620b7a92.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi.hip -> fmha_ck_autogen_d04dc4ed02eb42c3fe303342801ed3073a0dcb8e.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_deterministic.hip -> fmha_ck_autogen_3ccf0a9d5a5451da5dbf6075ccea45e4a140550a.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16.hip -> fmha_ck_autogen_d924ee32b178b6bffa7a71603d6e2818f66177a5.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_aebd5fed34ebceb879ae3dffaf58c7c04ab5fe80.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask.hip -> fmha_ck_autogen_5939e6610e41aff8d1ccdb66d9e84d3e48e8d379.hip 
-fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_deterministic.hip -> fmha_ck_autogen_b4bd2d206ceb237ed2c51f58abb5cbf96e39d07b.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_e56757fb17f5e94a6ba1fb14540a68c36d571159.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_3076a6de0e2612279e0ed64612f7393856bcc9ac.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_deterministic.hip -> fmha_ck_autogen_ea6a6d4cc262ea838dbb83ee747112f95fa297bc.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16.hip -> fmha_ck_autogen_1a6bc2762b95d550485aa720edaf71138d94cd07.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_614a9f10ebc51bde3f580ef527c17f89489c12c7.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask.hip -> fmha_ck_autogen_0271bd8b7c270e1593871b638288a4923342c446.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_deterministic.hip -> fmha_ck_autogen_4b74439f42140cdda9bb0f78d995d741212a35f4.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16.hip -> fmha_ck_autogen_d733f4c03e338ea7c6d8f759c1132499bdcea059.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_4432c5214c4d40c54ca2d02f0d4785c6d6902370.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv.hip -> fmha_ck_autogen_1f13a6d0f8c798c0c4ba4ad202d081899fe081ab.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi.hip -> fmha_ck_autogen_a1c71e7d33f0597fe090a3524e33e18b2e562680.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_deterministic.hip -> fmha_ck_autogen_e13b86fe4e153e0bfa8d1e75f3641fe32b0c5149.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_adae2d4f8b2dac799e03ea6f279e6ecdf66f5381.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_70586668a61ab88bc46b763df8f1c2ea52001ea0.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask.hip -> fmha_ck_autogen_82f0f3d71108dcc49234a258f0f3b21ea2123cc0.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_1de2f97d49f015b9af0b186801e939c6f357a0c4.hip 
-fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_9dc424f0e192155e3c4e786e5b87d5a1a3e6c4ad.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_bc744db85d4237ee9640f1658e0caab7648e3bb6.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_deterministic.hip -> fmha_ck_autogen_e8d8fe5f4f8641998b8b805a20b2ca92d019ee59.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_dropout_wg16.hip -> fmha_ck_autogen_549b6956eaf678f7eb901567d1a515eddbedae5f.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_ddcb1cfea1b0dbe50a02252cba99428fd977527e.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask.hip -> fmha_ck_autogen_86d73393d0d8b769f30222f7817563a955c36dfc.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_deterministic.hip -> fmha_ck_autogen_249668a3212cd00edaae871758be30a5a1fea589.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_dropout_wg16.hip -> fmha_ck_autogen_643b3798f11997d33ccb58d90ed6c10d5411b735.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_adda7ad787524e3e47dcc1b65c41b2faea38f55f.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv.hip -> fmha_ck_autogen_5d7ed4c885fb32a0b548186e56d64bab98071d30.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi.hip -> fmha_ck_autogen_77a814291d8f01870274149b9d82fb75921d6e20.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_deterministic.hip -> fmha_ck_autogen_f395bec57c3b2e6e169134dd8d20b287d7405134.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_0f588dcb2ef86677ebf84e406eb802e9921d1f1e.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_0caeedaa7d50f1741d618fb6c573529eebb075b1.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask.hip -> fmha_ck_autogen_1e33ce1fa113b221e5303b4093c2c4e748ce8298.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_ee974931e65d6b16b7c868d462b95dcae20b7513.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_85960fe542635079de5eca3c7785890cd4740005.hip 
-fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_089de13222caec1483207d4a54249f8da4f9c151.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_deterministic.hip -> fmha_ck_autogen_ffb5b7349a671b182d73c8016590f26fe06a4cba.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_dropout_wg16.hip -> fmha_ck_autogen_768c80fd3ea17813df1bf19a158186834fd00780.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_597a0276ec419f18f060a5186e6bb703ae434ac8.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask.hip -> fmha_ck_autogen_fc86c13e933cba40553ffba31d53aad27415ce4b.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_deterministic.hip -> fmha_ck_autogen_31c3760f5978baf9780ce4587ae4c768af0e49d1.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_dropout_wg16.hip -> fmha_ck_autogen_92b0770fe64e3c60b9e56170aa88bbf74802a813.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_c487a1a9933239270f44b1e08e1cf5323521c089.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv.hip -> fmha_ck_autogen_3a1dca5feb864e8981387c2d07e62acef1730aa8.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi.hip -> fmha_ck_autogen_96caa2056d99eb67ada498e287b4fae984397691.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_deterministic.hip -> fmha_ck_autogen_683e8a33fdb7053760c9c135002b0a94facbe015.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_7726be8909f631c04d4395fa4ffd03a736f447f1.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_c197d1f050f42d82e6851fa286db6f81ba197f40.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask.hip -> fmha_ck_autogen_d3a23ded424200d0c6f06b1dbd0a7b7b0e7b5d9b.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_6ff4605d82507fc4bd6e96095eaee5173ea41973.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_71e3980331dc4bcec6ab6f4c345c7b5f71356979.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_7e9c7feb747241c9c7de2adf3a19933a1c4c0995.hip 
-fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_deterministic.hip -> fmha_ck_autogen_1a236be9da05a07d11cd28034d90cdf89941a172.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_dropout_wg16.hip -> fmha_ck_autogen_ab0c3fe9529e24327686070731d0ac3ada76245e.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_66be70b088b20fc8de464167c35745461ddab640.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask.hip -> fmha_ck_autogen_fb4c15452f9155c5966990f09432e5eb7e28e785.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_deterministic.hip -> fmha_ck_autogen_17b9b96edda151072215502cc2b606bf1f6f0b03.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_f36aaa63ed42a578b953ebd614318d44cf44e8a3.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_e578ec9e09d3b78dca6b5bf0be1538657f02f319.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv.hip -> fmha_ck_autogen_09513bff5c1da6aadf11d2e8272a422eabff21bc.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi.hip -> fmha_ck_autogen_f020134822739be6fa0bb3d98e9dec79f025324a.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_deterministic.hip -> fmha_ck_autogen_7a13d62a715fd717f0d4101f787349cb49cbe70f.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_d40569ae9dbd693c0ab3d6ba69704d31e451011b.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_2c808da5c2514806c2953bb77d5692e5d7c97aa3.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_bc79e255d25744725e2a9db9f90d5cc2b8a0e0c1.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_84dc4af43de08130a04bfa06df9799b6e9e96900.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_006c417a52a1bd7c55e45d111483d26f4480caeb.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_e02a198f23c409b715761b702d7b0e6e5992701f.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_deterministic.hip -> fmha_ck_autogen_468a5f057fd5cef2df5f919f5102f47e86901e3b.hip 
-fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16.hip -> fmha_ck_autogen_16047b5544acef40e39932672cac6f562e200948.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_741401abfbbbdf0dd1d62df8bc3e85371ead71d6.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask.hip -> fmha_ck_autogen_9009b7d39346537aa6c4a4e46b81139f603edb60.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_deterministic.hip -> fmha_ck_autogen_75c38912947881caa14b3fc7ab7bca317e296dc3.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_1e943fcc2e64c618fc1415b3f1a0db4d70aa8494.hip -fmha_bwd_d256_bf16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_6d470f5c6fb81032fcd7974180297d4bb2a8427d.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk.hip -> fmha_ck_autogen_aa1041530f794c7b8dc4a8321ea0fcdd338fff35.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi.hip -> fmha_ck_autogen_ec9f63a538940e5ace02ae5b5ddc01f730adac4d.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_deterministic.hip -> fmha_ck_autogen_459c8fb6028991321b09a990c2188d854d940268.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16.hip -> fmha_ck_autogen_a2a715b7e9c1a576f011dfe5769c5b392e984f82.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_64c3c1e3dac623f07c2dc1b934ccb868cafcb38c.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask.hip -> fmha_ck_autogen_ccd0b777df1328bf24e070ed4cdf8615bb2199fe.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_deterministic.hip -> fmha_ck_autogen_6dd707cf48a17d31abef94215c5720419faa0a39.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_687f4aaafd1a5b9ee85aadc6fab79ad0c27a2ea2.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_aebff7e6605b273bad844b8f70ef031625bff48e.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_deterministic.hip -> fmha_ck_autogen_cc127a63d56099e08125b16939dac82f0173122b.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16.hip -> fmha_ck_autogen_7838849e57ee9cd292e588f587a8079b57becfc8.hip 
-fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_e638053e01268a4c5883620fc6a9901951e2e01a.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask.hip -> fmha_ck_autogen_405e7efa263223148318ae96bd1929b382e994e1.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_deterministic.hip -> fmha_ck_autogen_4c69d06e3f32e3b6d28d3e54ad764b472741c193.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16.hip -> fmha_ck_autogen_64a0ca185449a49fa485892fde6af745ba758167.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_af6ccfa11add1ae49888337e84d9c446d2f67da4.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv.hip -> fmha_ck_autogen_a487f617c4b84c6a0328fedac750d41dc3dafe27.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi.hip -> fmha_ck_autogen_6e6a4475ea795935f4cbf2dc0ac156a33d754587.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_deterministic.hip -> fmha_ck_autogen_d95835bc6f000d3a3379bbc38d90e83dcaf867ee.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_2c2e75e6f659a500dd3cf2cfd65118f111342119.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_28f2e2b108a53308a0cb6c123c8d318cbc2eadb4.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_a65c43b870705c780d734f9ef063f55cf8b3b52d.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_8fc08b4f3959a2375ac03f40c4ce12d70cdc2d80.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_a673f35edd69241c6b921d6712dfd064d78ecbad.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_ab877ae2a1aab04498bf2b26b3fe99d6488ef151.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_deterministic.hip -> fmha_ck_autogen_7601e6aea44b96e94fb019501be6b102c6e6a654.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16.hip -> fmha_ck_autogen_4ef35d82ceb4af2e07719c16109c6d72eaedce67.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_3c64c33870ebc329921cfa3867d58b1857421f65.hip 
-fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask.hip -> fmha_ck_autogen_526c89b7a04758b4badbf9695b316f877b8bb053.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_deterministic.hip -> fmha_ck_autogen_b3da22d3482738a8474ae15e8e5fca9020c4e195.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_f672bf80a78885428b2c02e522426470653a7351.hip -fmha_bwd_d256_bf16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_7c19fc90e5a9c422dbf529d2def286f47dea0f50.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp.hip -> fmha_ck_autogen_76704ca28a4877a1e84022e022614709adabb280.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi.hip -> fmha_ck_autogen_0029076f83a3dc695a167beda6fe19230a2b114b.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_deterministic.hip -> fmha_ck_autogen_da29a515d14dac02066bcd4701285b9916b43cf5.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_dropout_wg16.hip -> fmha_ck_autogen_33e7c1e5f41a451c7baff54f7238b220f1bdf8a1.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_3c38bb80e9880335faaea81985ed5d0e713ecb08.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask.hip -> fmha_ck_autogen_77d0223697ed41c4c2fd8830f8df6e5620db547f.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_deterministic.hip -> fmha_ck_autogen_987a617fae00fa90a1ba60937b0312c81087c19e.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_1a6785392af35e27d6697b584cb6f17a766d3fee.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_f3fd08d56f8a9be1a8dd104cdb1ac58e283b5064.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_deterministic.hip -> fmha_ck_autogen_73d4901b8ef034590314048de7223a572d61ee0f.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_dropout_wg16.hip -> fmha_ck_autogen_0502e718337eab7d47aa65cea7d3c5f641484520.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_dropout_wg16_deterministic.hip -> fmha_ck_autogen_618031345ea71cc17e458eb97a559b7c94d3ae43.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask.hip -> fmha_ck_autogen_14c4ebd1792c781d219bd21b691b575f64635730.hip 
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_deterministic.hip -> fmha_ck_autogen_56de9a7dfb1201b56528740e9d8a07b62710fcaf.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_dropout_wg16.hip -> fmha_ck_autogen_cd0453a5c3828c1358360f31f5d3b7258e17fdb9.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_4cb1861e31df98bdfd731efc3d335055090d83af.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps.hip -> fmha_ck_autogen_be8ec1163a01b9cd9a802d8b44669e8770c20234.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi.hip -> fmha_ck_autogen_f0cad48d9bc80d58705ea60eb2dda4baad68cedb.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_deterministic.hip -> fmha_ck_autogen_ef7cc2aa1ffd38298b52764a93cd1271b4d92f8d.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_dropout_wg16.hip -> fmha_ck_autogen_3408103188e27b3bc55dce0c1716c0b4d32d6494.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_1bf767e7104cfc8322f26df35907fbf04b8948f3.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask.hip -> fmha_ck_autogen_9594816877815bc0294610ca24f986fdccdc7c6f.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_deterministic.hip -> fmha_ck_autogen_d9061c204d8a85c974676f4438994a0be9d69a60.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_becc2a4d7ac045365300bf8bd45fc6d3e1e1c8b1.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_ddf5339054f47d9ed6cc7f9e66ab21ce3bccf3db.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_deterministic.hip -> fmha_ck_autogen_b01dc872c24db4db0c9179fc07e17f41060390de.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_dropout_wg16.hip -> fmha_ck_autogen_84e8ae99e184013739019c93d07caddce532382b.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_dropout_wg16_deterministic.hip -> fmha_ck_autogen_6a66604bb15f97a56847a7c968dbe32d247cbc13.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask.hip -> fmha_ck_autogen_90e5c56e92712d00092ba102a5eb5176a3e5d471.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_deterministic.hip -> fmha_ck_autogen_8352031044ef2e4a22e27ad04ab5d2c02121faee.hip 
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_dropout_wg16.hip -> fmha_ck_autogen_7dd260849b86c46b685955cab54ba07d49b47954.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_afda8f46b5ded4c2aa9d722fec17b75004b59f7d.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk.hip -> fmha_ck_autogen_98e484adeddf3394d8d7693b808d83b64c71ee69.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi.hip -> fmha_ck_autogen_cbd571f4fe576fdb17d5f75a558cb6747087c7f2.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_deterministic.hip -> fmha_ck_autogen_378bf438642e5d863e31145ada2a0688059aa5d9.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_dropout_wg16.hip -> fmha_ck_autogen_95530399ad7b43d8ce2c89da24c71056f2146b18.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_b00e062055933388e37525df5766f3c14cd3538a.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask.hip -> fmha_ck_autogen_236b3eef02b904304348b9d35f715b639d63218f.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_deterministic.hip -> fmha_ck_autogen_069c663be0267c009be4814e9e4e7c13ec999411.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_a017be7b8bcf303b30a147f41346898acc5fab7d.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_d20d45aa85c0daa299da98c277cee826fe67bd27.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_deterministic.hip -> fmha_ck_autogen_b34c1ce348c3d9cdf6bbec9758de9d5fe94c43fc.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_dropout_wg16.hip -> fmha_ck_autogen_1c1b0f85e085dd0769c566fb16aafe5ab5952714.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_b513834918d5ea789e2db21abece7c2d3532a7e7.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask.hip -> fmha_ck_autogen_0513b2f3bd8ad51315aadb7f63737201898adca8.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_deterministic.hip -> fmha_ck_autogen_4bd4d46397a3749646b232b306688e52b8c6e584.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_dropout_wg16.hip -> fmha_ck_autogen_f12f1f1b679cabab04218037ef370d2c7e1fe332.hip 
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_d623b36cc3f56d1001b2d3abadd8a5628fefd014.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk.hip -> fmha_ck_autogen_3f5e01b4f2ca8ea10898c39d6570bd74e85f46ed.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi.hip -> fmha_ck_autogen_a5bdc110955c05c6c6ea236a6f60266a4a6dce5e.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_deterministic.hip -> fmha_ck_autogen_70c8e45f6ea7cf5dba9eeadd0b19481d9f5defb7.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16.hip -> fmha_ck_autogen_b5371415448fffffd58bf014dac9f4876153657b.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_ae4e80cb185759dd9b3eb3c67c239964b3694caa.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask.hip -> fmha_ck_autogen_096863cd93d1b105a617d0daa1d4f37d7fb6b893.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_deterministic.hip -> fmha_ck_autogen_ae8d0bdde763e617beafc0365ec4a3cd11df6c55.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_f7cf08242b3fb1c643d4149bec985b667b9d28fa.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_44c181996532676f2140fd026707135144e9d37b.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_deterministic.hip -> fmha_ck_autogen_8f6e463eedd3e65b9c79feed3cd92ad8cbc9f036.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16.hip -> fmha_ck_autogen_9638c9618dbf2af119e37596f7eb0fd3f8d72748.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_7f80d44e82e601dc48d4c8b4e710ef7265894b6c.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask.hip -> fmha_ck_autogen_85908fe6dc9c629c82d6953081b10021e64583b1.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_deterministic.hip -> fmha_ck_autogen_fecd7501265b4c4dcf015485e63e2324304f70d3.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16.hip -> fmha_ck_autogen_3b508b92f7e123b21658f6e17d624ffa87831fee.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_01e2428c5447aa9a78f79f73f31cf685c586872d.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv.hip -> fmha_ck_autogen_e088f0f7363804cf5403adef70828ab32d09a02a.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi.hip -> fmha_ck_autogen_f4900c0a5c0d03dc17d7a907ab40652d9920e756.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_deterministic.hip -> fmha_ck_autogen_cb20538073888bdb3174a8e9c32d7449072aa753.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_6a3f42d5c9ccdd3807e488b00f02bc6ab5d8d99a.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_c9f1e7e478a2208c4d32e2d7e6abebdc16bcc5fe.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask.hip -> fmha_ck_autogen_8457ea5726149efb8778e6d90798b8e48288fc9a.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_37ad61bf8427a26775969f8a9166fd0bfb7446b4.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_72abb25dba0c48b380b2dabeb6ab7efaa706d180.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_1a5e18f6333ed2cce509f07cb8bd5868951d66a0.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_deterministic.hip -> fmha_ck_autogen_091cb49c1958fb4342d79f367ea93cf2b472f785.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_dropout_wg16.hip -> fmha_ck_autogen_a93324ccf11b273ed20fd960c61df897c8890b1d.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_906fa8bf5e992ddc25815486ae9c24d8bfba7227.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask.hip -> fmha_ck_autogen_6ef5803b33d97db72eb8a8528aeb3fc956a938cc.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_deterministic.hip -> fmha_ck_autogen_a0874fc5ac87a1ec487c7722bf3b1bdaa924ee09.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_dropout_wg16.hip -> fmha_ck_autogen_e7ae1294b6dea5c8b93c2b814fa7460c4047105b.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_238e4c1ca112afec494fbe47a85b553302c43395.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv.hip -> fmha_ck_autogen_ab09941bddfa9d61985b55f9b6bf0edec9bb89f6.hip
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi.hip -> fmha_ck_autogen_48280c91d7cd8712fd533e246a6b0f758834abc9.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_deterministic.hip -> fmha_ck_autogen_6a95543aeed81adfb6d847f78212585a36122ae3.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_6767cce35ab784aa42ebcb75af7305bc38a8721a.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_31b807c48c472e9b1311a6037cd98e21d6706889.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask.hip -> fmha_ck_autogen_dc4d27535b9570b8f4b790470a83c1d0a9a2b6ce.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_ab56e886d53a1d88fada0f10f00b9f398dc54568.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_8adbdcd28cb2f078f89adf9aad2b3d4a0a477823.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_bcf8836c8cf932cc2748e313885003f0e11a887f.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_deterministic.hip -> fmha_ck_autogen_2af6c5be53732eb1939a2f93232af7dc011dec1a.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_dropout_wg16.hip -> fmha_ck_autogen_da9f6e1d59132fe96709490af25bd794f267851c.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_b31f56244076c501cb09b4b90975132cae4c4386.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask.hip -> fmha_ck_autogen_f9c58761c927b222112cb5cb6c9acb5d3c915785.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_deterministic.hip -> fmha_ck_autogen_041a0718891596ddac1fb0088637029233ccbe60.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_dropout_wg16.hip -> fmha_ck_autogen_9801b25e0f132d647934deb395b62a3f70cc7c88.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_6376eb68c550b50b9aea42a7a2cc3bda186b0e40.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv.hip -> fmha_ck_autogen_810dd4e870ceda3ba9b5f0084a4b025b2e609d57.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi.hip -> fmha_ck_autogen_a821661d8280c6e9d27f2c9ce1b3c855387b5a76.hip 
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_deterministic.hip -> fmha_ck_autogen_4be4a98f150f3f9ab6f03b5fd0968c5454565c9a.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_96dee49ec6755006d67f0c30c65f50558bba69b0.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_83d580a612af85533c87aecdd7b0345c71b75980.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask.hip -> fmha_ck_autogen_451fbbdc2dcf2ec81efce34673ee6c425cc16ca2.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_c4376ac8d82db1bc25fa273a80dfbf8b71ee5e2b.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_910cb8bd09d287a1566265eb1e8894fe68d3cc81.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_5b7a4ea3bb8905a22ae97a94c354b1cbe38093bb.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_deterministic.hip -> fmha_ck_autogen_da07d8b5666423da30a95e3b2cabd3839d200981.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_dropout_wg16.hip -> fmha_ck_autogen_5bead6be6e39ece0e5d44335083336f7f546d2f8.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_bcb6f0730fd09b4c6c60913425927dfdb8f83d82.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask.hip -> fmha_ck_autogen_ffd868d49abdb769ab82c21508d655daf54b8a99.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_deterministic.hip -> fmha_ck_autogen_d9c3e27b522320dcca5ee84fa534b03aae2bfea9.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_c323a4d1f24d59bddd20ed2f2fb6446627b0ae8b.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_fa16fa84278b489af253b52839786f94aeeac36f.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv.hip -> fmha_ck_autogen_bec9e4c0317e8d351f60258ed6611fbf365c4024.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi.hip -> fmha_ck_autogen_13d5f2ec83b3331654e37ea0b44d88cd98abaa37.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_deterministic.hip -> fmha_ck_autogen_fd614df484b263deae3b3c20adb0ce7b62eaa651.hip 
-fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_05e60b3ab7477f9edc8576a8bf43e3a62b8d5ef8.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_fccabea88b8e290688c1b360875d228e6fdf1624.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_486f6c7c7655c34b7b9973ff357b0813f0a3fd7c.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_3cd7a9ca49c1149d46f6b05b0fefc41ecaeb6ea1.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_5e62968de58d9df7d687d671f37d63393f189321.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_807545400aa6e70ff49a5f38ed6a218a180bd87f.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_deterministic.hip -> fmha_ck_autogen_f5803aadd93e33567aa6b23100ce4fbb6c040dd6.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16.hip -> fmha_ck_autogen_4466b6c6b2ec3acb40ac1cda432efa1e4e62d9d9.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_bbfd025488e52b97c04995c4c5faff371b77e4d6.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask.hip -> fmha_ck_autogen_b298e213f927b518c693660110f08bdd94990ef0.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_deterministic.hip -> fmha_ck_autogen_d090b771a4f9750132f549c82a88b4ab00dce5c7.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_9068ba8df8b0e977e9769f6acf6cfee6b00b9922.hip -fmha_bwd_d256_fp16_batch_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_6d17b92fab5bee7717bf9aff6a6bef7cee3816e7.hip -fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk.hip -> fmha_ck_autogen_dd10bbf37503bbc92af82bc3487989b41b20ca85.hip -fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi.hip -> fmha_ck_autogen_f0209426a8e6bfeef7d8ae7b16db791888142298.hip -fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_deterministic.hip -> fmha_ck_autogen_e89bcea4393593313d18a4aa6dcb44cd75bc828d.hip -fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16.hip -> fmha_ck_autogen_e34b7e452a4db74189334697e3a240ad68085f0e.hip 
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_615430cb65d8d540836c7f12b3367abd3c8e63d2.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask.hip -> fmha_ck_autogen_afadc4f76e237514db0bc0203102297b79730bd0.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_deterministic.hip -> fmha_ck_autogen_249e6b93baae25dff97a0bc9145a8d328ed3f317.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_c806d7803d06ef8aac1d5caac9f36aafd47653d5.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_3163272d25bc2db2ffaa1fea87648b45ee68d408.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_deterministic.hip -> fmha_ck_autogen_b9baf70220079e6d4e87eb01a7259923d8a01e29.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16.hip -> fmha_ck_autogen_c5fcdea177734366d3bf283317a65cc3fffda611.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_d25ce4b3e9cc392ceafebc7fe3bcbe05aaad4bbc.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask.hip -> fmha_ck_autogen_3bb129e6dee6848043dd0e8fa812ae80fec4d014.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_deterministic.hip -> fmha_ck_autogen_7d2f87c021e0b6a27b2d7e30351fd50f06414b5f.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16.hip -> fmha_ck_autogen_f4c803838f5644ccc6f04f7c8a6233fed0b6639e.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_2c82e3c4e445e1e02f14435e4ca01a90850139a4.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv.hip -> fmha_ck_autogen_a21f3637624762547af1292e1b85e640b1d329dc.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi.hip -> fmha_ck_autogen_c9ba0a3369d4e4eaea1c902a90e6501f232dd57c.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_deterministic.hip -> fmha_ck_autogen_1914250fce818584291c69a5f058a58cfbd83df9.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_01d3b034a2d8d0b83c0aefa4faac6c3f28ce737f.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_5d707d065ae152450f9def619ddc3dddb9089e88.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_1132b11429034d96d82c82dbfdb69e460ad8a564.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_4a5dbf601de5754c03a03a1a42395dc0766fb8ac.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_5a29b93cee012c79d4364502f1d90f947c73641d.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_01e8f0df0c54ce619e5b66441b3c96a5e18b05d6.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_deterministic.hip -> fmha_ck_autogen_1d498e418ebbf33bed58b4074d1edf3d9bdd07c5.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16.hip -> fmha_ck_autogen_4d7dc0f356b630179916f8fc2041b7f1402b46df.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_292454f2d82184ab0491ea0675750c6ec55d659c.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask.hip -> fmha_ck_autogen_c538dc4f65d02776875627cbd20a9c794d70b043.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_deterministic.hip -> fmha_ck_autogen_2d1f2d1e57095f756ddd11e8e9d4f6f253e3ffa3.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_16f94f5c65c37624f5458c165daf83517d9e3c81.hip
-fmha_bwd_d256_fp16_group_b16x64x256x16x256x16x32x256x256_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_2703018e71d57d3266fc35e2e18a78faa3dd52ce.hip
-fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp.hip -> fmha_ck_autogen_ce5064e27ba427cb951f7e1b01328b0beb6b2b7c.hip
-fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi.hip -> fmha_ck_autogen_aec87e65afa93e84d7a947c52f291c1c7360033c.hip
-fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_deterministic.hip -> fmha_ck_autogen_30f0200092b0e18d57a9f5e512d565f1c0229436.hip
-fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_dropout_wg16.hip -> fmha_ck_autogen_61896aa9e4e4d7e494c1755b1e77a08e0e264f8d.hip
-fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_487724686efd35731e5335efa949486c93ae26e3.hip
-fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask.hip -> fmha_ck_autogen_3e61b019e1398a6a3c36143fb84b5ff22c9f4508.hip
-fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_deterministic.hip -> fmha_ck_autogen_94a94d145e575747c8956ac703810582c819e2e8.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_4d3b1ae63e127b6e6afe39e354d4995afc5faeaf.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_438e3565f4c720e6c9691b0d33c1392936e2e7ae.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_deterministic.hip -> fmha_ck_autogen_d3fce1e11aee2273620e75efe4aa0390fcde9ba5.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_dropout_wg16.hip -> fmha_ck_autogen_1f0cad6ad5b172e51c569e84cd54a19b4eb0ed05.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_dropout_wg16_deterministic.hip -> fmha_ck_autogen_d54b3731883a5f8393d60d27487f8d017aedd3f9.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask.hip -> fmha_ck_autogen_0efdaa9266a5a464009297dc59db92504f8bf1a3.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_deterministic.hip -> fmha_ck_autogen_99f8352674bd6bbe98944a1c0a769a4fc028a623.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_dropout_wg16.hip -> fmha_ck_autogen_a5f2f0cef657ae5e333d65ae4ab20529a43cd7de.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_9eef1b54d5d3841f3fa6b84cca6c7ad33efa2d9f.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps.hip -> fmha_ck_autogen_92ba64cdf615c1be2865f027a293cb530fc07dc6.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi.hip -> fmha_ck_autogen_931cf8d05cfa45319f4e5bb49334d35a530bffcf.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_deterministic.hip -> fmha_ck_autogen_34807a8e90bf1cd839f32fd718afa6469c35a4fa.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_dropout_wg16.hip -> fmha_ck_autogen_1a98bcbe900f8c141136d18c114b02fffbe8bca1.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_63f121a3c8928c10a2d86b487cd13fa995da670d.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask.hip -> fmha_ck_autogen_8f607ee20c0d92b6dbd0338f139517fdcce98d0c.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_deterministic.hip -> fmha_ck_autogen_3a6b9566559ed2b1c85f2bea1c55e72c41dc47bd.hip 
-fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_45f4363f50af1e7ccd24751d5f5b181bf32c604f.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_22a07ecf1a59f72ec6bef3e970d7f33cf54c5f44.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_deterministic.hip -> fmha_ck_autogen_3400f0af03743dce328486f8fc805dd30bd6da31.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_dropout_wg16.hip -> fmha_ck_autogen_9b841b7cf5da31f0c30ec42c91cc8d5bd3fedd03.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_dropout_wg16_deterministic.hip -> fmha_ck_autogen_0e3f4cd28a4c06cc109f6a0798a77844bcc750b7.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask.hip -> fmha_ck_autogen_a103cd47156a98ad2cf2c325ea00df3f1d67fb72.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_deterministic.hip -> fmha_ck_autogen_bd37f4f7914805a97d5073f1ebf8a8b8c2648d31.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_dropout_wg16.hip -> fmha_ck_autogen_030a759dcc92028b4c6f317fc230b98cb929e806.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_8d79fe8a600c3b4e0ec9aa510f8036ba2b608985.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk.hip -> fmha_ck_autogen_10ceed95b0a0a01f844678717c88e0426fb503fd.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi.hip -> fmha_ck_autogen_90b17d8cba28cceddb3ef907df878aeef0762d15.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_deterministic.hip -> fmha_ck_autogen_b5ac596c636df55e81293228cbc53dcbb3024e5a.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_dropout_wg16.hip -> fmha_ck_autogen_e68a9e05debd456a9975953f7b0d510e7a0f6978.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_50f915b4d9bd18a3c25a85917392ea4a5e88b349.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask.hip -> fmha_ck_autogen_dd67d442001d2b167e70e8730abde4d4461b8569.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_deterministic.hip -> fmha_ck_autogen_4160f6b6d0869740a5a411abd80108f729f810eb.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_40357c5e9739eae136a7abf92bc38d3ac94753f8.hip 
-fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_b4ec377c44ac18527ca6a01bc3b146706a6e1e09.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_deterministic.hip -> fmha_ck_autogen_02d88a03cd3966dd0cff550065f58c3ffecfff6c.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_dropout_wg16.hip -> fmha_ck_autogen_093834d4d3fe76e1745e4482c6b51b550c6f3dfc.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_3cf45927b6d931e31e2209685d787efa28eed8ba.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask.hip -> fmha_ck_autogen_634d530731c7ade2c7beecfd1bbbca8583032217.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_deterministic.hip -> fmha_ck_autogen_311731442b756308c0a869f21b7b8b103aa613e8.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_dropout_wg16.hip -> fmha_ck_autogen_2ae344010d49f7f9a6caab2cb84be7f87d2d96bf.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_ae239476d61f48379754b97f29d7a285cc3192de.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk.hip -> fmha_ck_autogen_6e7e1d245baabe2f6293e3d85318f9936b333500.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi.hip -> fmha_ck_autogen_f6566441ac3074578cfe45758ba0583c0da0a5ab.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_deterministic.hip -> fmha_ck_autogen_de26a187c4db06115072a5132e1166b5b03368b0.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16.hip -> fmha_ck_autogen_71dcbe9f481c92215f3b636bc0e86ce8f65e6472.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_9a20fa19d8d30654602e363806f559113218d66d.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask.hip -> fmha_ck_autogen_1e22f2d99804198c61251b4629a3f18ed3dcd42e.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_deterministic.hip -> fmha_ck_autogen_38abcbeaa4d33d3150f2b0238bb62ebbfe960980.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_d0863830fc5d43dc6d6400280e892bb7de2892d4.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_0cee6b9427c164d78994150305a47f73954a67c0.hip 
-fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_deterministic.hip -> fmha_ck_autogen_04caeecbc01667ec6f5599358a0a20423aa9a00b.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16.hip -> fmha_ck_autogen_33099fcfc218ffdf69edb4f2f0e46121bea9fafc.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_3dba3cd44f78c950fe7ceaa5f0629dfc607b30f1.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask.hip -> fmha_ck_autogen_c2f04447e6a94c94a2315454e71d7d607a9fd0f8.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_deterministic.hip -> fmha_ck_autogen_8e2d5f979fc4fbd0991581a020a414f9c8656ae2.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16.hip -> fmha_ck_autogen_1241814f76107d74ed069ecec99a248676487eee.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_bd28203f47b6a48e9b66302cf8312f3796ca500c.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv.hip -> fmha_ck_autogen_3d289100991d4c8c362f64c8f6c4ba395c2f3495.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi.hip -> fmha_ck_autogen_7c23dde1a386436e9864c8fa5f1706c0d2fbfd0d.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_deterministic.hip -> fmha_ck_autogen_bd8bf7c572c1984ca3061062cf3c31d993f6762d.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_164a947a6c2ba83a5b1cb7074aee0bdac6c9c64e.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_9b062dd633645772e4f2caffd111af73184f7657.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask.hip -> fmha_ck_autogen_abf6c6412f9853855b74a96e862935ddef66f763.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_aebb2441e6cc1ccba4a391566e547402bcf7ced2.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_66968bbf7e210911fcb95ba90c79837230ab1ce3.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_be1e1533fc37b41838bd37edc2b6d2f2e76ae1c6.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_deterministic.hip -> fmha_ck_autogen_3a2280997eb6f1d091094fc54cecf42b7c9c3a2d.hip 
-fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_dropout_wg16.hip -> fmha_ck_autogen_4b4c03c916393d6be7c5181369ebcef949eaa763.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_4ff20bafbf156fe8fb80bdd84a5d2f3a4a944c1a.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask.hip -> fmha_ck_autogen_be4dd90ccb2f258029d0156cf23f940b694cf08d.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_deterministic.hip -> fmha_ck_autogen_e334e691714f0b99773c2ac515ed82de0f387065.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_dropout_wg16.hip -> fmha_ck_autogen_62eb2f81e73d65fddce7ff43c397da6529317607.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_285e61dad8f63fb973cb2eb899c959e400622652.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv.hip -> fmha_ck_autogen_a2ef5d30a2318ae06430d17f84878800c4ca7364.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi.hip -> fmha_ck_autogen_54548ad36fb92d0963893146c8db20f53cbf0c8f.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_deterministic.hip -> fmha_ck_autogen_3967a8807c9451b09227c0f685c18aafeb062fd2.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_94f6f9dee9f0c3825d91f4d320a5280070e60ee7.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_59d366421e0b51c90fa53c366d47ed8d51b3a329.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask.hip -> fmha_ck_autogen_dd35634440edb25cb095800b882c70aaceca1dbb.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_0628931bf5cc1daa6e106cf60bb21fa1aac6b1df.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_ae4e7253ad4873576052ec0a9400597bb7975753.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_f6f102a388ffb05c690a20a29cfe0b35a35eed61.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_deterministic.hip -> fmha_ck_autogen_235bf652702c2976551778b9159e09188575c63c.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_dropout_wg16.hip -> fmha_ck_autogen_2a45129fc4995abcb8f880692f11c6186fc01641.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_dropout_wg16_deterministic.hip -> 
fmha_ck_autogen_ff453e3bdc9752cb7b81f7cc3056325a8b9a8ad4.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask.hip -> fmha_ck_autogen_c08095341ca7e3a1debeb780c1878e351692bee2.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_deterministic.hip -> fmha_ck_autogen_5de27c4081377f59363c2bf2ea8624217566d2d3.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_dropout_wg16.hip -> fmha_ck_autogen_8c4688cbd23727dd0ea9a36fb977b31aeae98d65.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_4666db0ff7b035e54f2c0e59acedc2131b722a55.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv.hip -> fmha_ck_autogen_783ec08544591a22f59dc12f169b7327b4185a1a.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi.hip -> fmha_ck_autogen_131691f01cc7f29affb88152dd48c7a484315dcd.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_deterministic.hip -> fmha_ck_autogen_dcf815ef540060cc7ed43e1c57a28e1d080c5621.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_d7adde8780b39f1364c572a19c3bfb19417678e3.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_cf5c6c0bfaf98f6e655fc443246b81fcc730fe97.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask.hip -> fmha_ck_autogen_b18a615e66d7cd739ce35412811359a03cb23a8e.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_0fbddf533661642d84bf5a16149692d5a892182a.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_fde12cd366d6850ce26afce98e5076b695b4875b.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_df0b2bcba57e77d975ec5304fc50cbd09cddf4bb.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_deterministic.hip -> fmha_ck_autogen_cbe5a98163e878c7697e554758ebd0597c2c1760.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_dropout_wg16.hip -> fmha_ck_autogen_3cb0cee09d633b6f70febbba63a1e090522cfb4a.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_eeb0e96b759e18cf703cfab0cda1385726f6e0a1.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask.hip -> fmha_ck_autogen_4601680af41c8738089ff377147e0547dcad114d.hip 
-fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_deterministic.hip -> fmha_ck_autogen_6f3d098f8bb63133924aab70d26a6ed64018c13b.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_7d08373ace7087bdaca4ce8b0bc329f553f88d77.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_4ec2075f394acfb14fae7b1ef4304fd9b654ba0d.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv.hip -> fmha_ck_autogen_8a1fd28acfe85b3adac859c4bbffa4d28fe634fe.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi.hip -> fmha_ck_autogen_7bb7b63e8a4c1df4eac4d978e166867195bd6e53.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_deterministic.hip -> fmha_ck_autogen_80fb694fce7b4c3c459fca43c89c6002fbfdaef5.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_86513d6e065a44bcb0c789eed1e7e5456e800ab6.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_31222e158484773d2257f4a31e3dfbdb68336a8e.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_b20c6252863a73341b0010191fad4c834860f884.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_70cf755f1485c065222be4daab84283a9c3d0eb7.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_0c8a0bb89a6f05289c0405df5126fa0cc16252e7.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_88ac7f6cbdfca2e397bcb86af4216e87166601c7.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_deterministic.hip -> fmha_ck_autogen_db8f0bd93b352d28c5b6d78f4332026993f0bea4.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16.hip -> fmha_ck_autogen_96c5e79f54b71677124f555b0ae4bfd27248d099.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_0b532fcf26f90c82a792cde7943634f667c1d033.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask.hip -> fmha_ck_autogen_b6b17ae67adee9e56a022cd2a5514fb9c4e99920.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_deterministic.hip -> fmha_ck_autogen_fa62a97675719c2e8e9bb97361b92ff1c7b9d2ef.hip 
-fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_173c44dd85077e6b12dd06fdcf6b11ba349e1866.hip -fmha_bwd_d32_bf16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_f861d8693f82d22e2c5b1abbcbae5f30f4433e5e.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk.hip -> fmha_ck_autogen_970073c70133ff2ee4737f803a0ac43801c47242.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi.hip -> fmha_ck_autogen_5aba1183efe205af38e79a1b2dccea5fa515d02e.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_deterministic.hip -> fmha_ck_autogen_322a86568f89a5a5a165cfffbae9ca6949f2477e.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16.hip -> fmha_ck_autogen_321500dd4c41e4d68834814a48a639f5ca36a2fb.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_4f4a5d56721bb1a1332a65882132a8c5763932ec.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask.hip -> fmha_ck_autogen_44d82b58fdc3e5b7a7c20490ce7f5acce4e6ec79.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_deterministic.hip -> fmha_ck_autogen_678a4a8210a972bb2ed89d6ac754fb79438ab2da.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_085722b43cde5f37242edb071f639da7c4a0bd48.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_417b1cb14b67dc82f614831550f7deb0895bd7e4.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_deterministic.hip -> fmha_ck_autogen_7ec04763d635c5bc3e810737b5d948c59f117d5a.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16.hip -> fmha_ck_autogen_7524904ac5a2040c7ea72aef5942212f291a21bf.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_6979ef43adffdb62100270a62706fb811963925a.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask.hip -> fmha_ck_autogen_5be9ed84ad9be1627db7a66af9370679816c0897.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_deterministic.hip -> fmha_ck_autogen_ee239db5a67c23a383590a651f0d8a0be43a13c7.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16.hip -> fmha_ck_autogen_69214eb450c3b249017480efb8d092b0edad6dc3.hip 
-fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_0c32a2d9701e23dd930119c4ee8089042b5b0ac5.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv.hip -> fmha_ck_autogen_97246460c21bc66c0f13936d27477a9fca1c44d1.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi.hip -> fmha_ck_autogen_078b96ad691a85eebd18586db0b62b8911016d9c.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_deterministic.hip -> fmha_ck_autogen_7ee953cb24e28bcdc8f05783894b23cbf83bdf35.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_921f789d619db6f225e8e9d646e93bbc9dc1a669.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_28e4d2c757e4b8c366a2c320360e21ff0ef671a8.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_da6afccdee4107507a64323e17bf12c46da2b92a.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_38e12dad9e3bafe177ed3c27c833825813e18fc3.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_151a4425b411596c46c7032f6b83d3152a0e0cd4.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_d1d3eacc320104100bce46235fe656e5a8223c66.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_deterministic.hip -> fmha_ck_autogen_a71305f191f06cd53b7563971c706e8b71b19e2f.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16.hip -> fmha_ck_autogen_482e34930d11ff493007b1613993e01acc1af78d.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_342d29c85070f488a14b1915f948e5fd69019c99.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask.hip -> fmha_ck_autogen_b0f555b74ed36f1bef8f47880b3edc6760f27788.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_deterministic.hip -> fmha_ck_autogen_c42ab428503e8f8bfa78c8cb8d9afad9f5185118.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_cae6c7efbfc831e2bcfc8c1efa1a486c02627cbf.hip -fmha_bwd_d32_bf16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_bba10ecb79ede07324e1198a71a95ff26e9eb235.hip 
-fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp.hip -> fmha_ck_autogen_48ae3af78583258c4b13c11a442022e0e058bb85.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi.hip -> fmha_ck_autogen_82048cf91270631f98ac37dc488a1fb2e00ce004.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_deterministic.hip -> fmha_ck_autogen_fb4c5f8fecfbbe16e6648becb3b5ca89fa3d8a94.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_dropout_wg16.hip -> fmha_ck_autogen_6abeb7b50ae6a1fc62535b9a1dabbde6f177a9d0.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_8a824621a50cdc3cbadc4b1f9ef18e1325385082.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask.hip -> fmha_ck_autogen_f69548d6cced86c21c09c6475237a0cb926df0ed.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_deterministic.hip -> fmha_ck_autogen_325fbcb9e503e68fafea08abf86a4951f440850f.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_964f916d3484295b5918e2e4c22c5529588a5662.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_df645b3888dc8d1df50c47c0d75822eebd3eb019.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_deterministic.hip -> fmha_ck_autogen_75a310a6eb86e3e8baac7a930c3ffbef372942b3.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_dropout_wg16.hip -> fmha_ck_autogen_458d708d13577f2b92e6d5adfe952a87e0cf7be5.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_dropout_wg16_deterministic.hip -> fmha_ck_autogen_15fe3e8f4add16a088fe44458353fa7c0c4f9658.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask.hip -> fmha_ck_autogen_0d0e0147a92061d32608a34e7b47bd534eb787fa.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_deterministic.hip -> fmha_ck_autogen_4e15e4f16de26068cba30ef12fc29332d45e460e.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_dropout_wg16.hip -> fmha_ck_autogen_e2bf6805a489739abb77c13173d57723e9304afa.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_15cf7068183421b141ed5d6e7fe902d06b6492a1.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps.hip -> fmha_ck_autogen_703246f1f53a988cf252eff88bdf814bd382d3ac.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi.hip -> 
fmha_ck_autogen_381b29d9888365bff0f109d897b508eebfd8a61f.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_deterministic.hip -> fmha_ck_autogen_f2da112b1e07c44fc8a7f19368da203f6935049c.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_dropout_wg16.hip -> fmha_ck_autogen_1886d4bf54b3a4a9e093360998b2059b3c03d072.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_b60a4e87a7aabfe3c1ce02b408522f3ec862e3d7.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask.hip -> fmha_ck_autogen_a62a2ab489839ea1a1bfd1b24e54a3c232ed934f.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_deterministic.hip -> fmha_ck_autogen_36a0a960541bd8a2dc6741579de685b7c0a5f6d7.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_1f6bc5faf18be193212217788d476ce6fd384bfb.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_13f747525ad31e76c88774fb2208e470da9c2310.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_deterministic.hip -> fmha_ck_autogen_71b6100efe30d836dab557ea4ac54c4b9d35c6aa.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_dropout_wg16.hip -> fmha_ck_autogen_62ab710e4acc711430745e05e036dd6a4d6bcdca.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_dropout_wg16_deterministic.hip -> fmha_ck_autogen_7597ce4d2e5264bdeda47487d5bdb55a014c6616.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask.hip -> fmha_ck_autogen_ec7ec8d547ee9713aa3b5b667f22cdcaa8f62b2d.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_deterministic.hip -> fmha_ck_autogen_4fe530cbf6363a8f08a94728e45e88ecde299e7b.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_dropout_wg16.hip -> fmha_ck_autogen_661ffaf653085dd7f122d603bb3ba4b001e5f3c0.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_345ea796c8d97bfe3b7c9663bf15e2e5e7696235.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk.hip -> fmha_ck_autogen_802b21f9588d72c3c3e3b9a3b269f19c484d5aa4.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi.hip -> fmha_ck_autogen_9ae866c7db36286876818bfb718ac35204fa3843.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_deterministic.hip -> fmha_ck_autogen_faf56e45b2240515e97fc1bfd552eb03b6de5094.hip 
-fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_dropout_wg16.hip -> fmha_ck_autogen_fffbfcac254e33926131a71905e93f9cc0aef89e.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_919ae177b7a793fa352c4f6bb8e4175f3064d814.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask.hip -> fmha_ck_autogen_ac9382cf8bb56ffd962c99329bf67da992f8810d.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_deterministic.hip -> fmha_ck_autogen_cb1deea4f4fab0db31d46a91228601f0c272d6e6.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_144f19363ef26efd36f0436cfa9f84f181a8824c.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_0392491c5a6dfc742c2be483419a40f6a7a7ea56.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_deterministic.hip -> fmha_ck_autogen_cabb7b12cdd9b8b522af577e13232b2459dbd38d.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_dropout_wg16.hip -> fmha_ck_autogen_caede7a18f3e3d5e24f6c70392413a2cda16ac15.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_b9d00ab8373747a5c6b9d2f8dd50ceb14db4163c.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask.hip -> fmha_ck_autogen_ce909cb5f96a4884caa0d2eb8c5e6bc7fa352797.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_deterministic.hip -> fmha_ck_autogen_1037f1bc50c4a65dac09ba56b701256b701c4322.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_dropout_wg16.hip -> fmha_ck_autogen_aafe891dad43815e635f81225705ff944f990d75.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_7afd1a756247b15b078d15a39e350a07c22982da.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk.hip -> fmha_ck_autogen_3e839660557dee9d5bcda9b56940ce23236c5f6d.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi.hip -> fmha_ck_autogen_fd26e43ca652e6f58ff48c356165aa4349833b55.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_deterministic.hip -> fmha_ck_autogen_b3486244e0b7d6dbcaa1951e8b8883ce441c3f99.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16.hip -> fmha_ck_autogen_90da0d469cca5c8481504148468460c85a15c559.hip 
-fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_714c5369aa848021e020d874289e3ae4e0f74d77.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask.hip -> fmha_ck_autogen_cc54b107e1b557ea36b5cbaf7fe3dfce05415c86.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_deterministic.hip -> fmha_ck_autogen_00042c36bc588e60a7c8a9ba297a8a25d8ac0660.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_65794d9c185b21f59274ac5d4db10a7abc0be968.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_faf686067fa433cea5e95dd523846dc881eff635.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_deterministic.hip -> fmha_ck_autogen_39d3071347a0c98f3221104036f477aa13bffa4d.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16.hip -> fmha_ck_autogen_e76879f8ff4796f48ad87ff8003f4f6e6adca9a0.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_4377ac04be3a6cbdbfbe57612a469412812fb5b5.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask.hip -> fmha_ck_autogen_c1b76bc7a17f573c0d52c07ae9ff4302662ae61f.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_deterministic.hip -> fmha_ck_autogen_d713fe25dc90b3511fc259cebf463376dcb55d84.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16.hip -> fmha_ck_autogen_89a3327da9a3411ff1cddc67eb647083cd947a92.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_6a7eb3d86aa385f9ecffbc5ba10489e56856f918.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv.hip -> fmha_ck_autogen_4d65e58c9f147498ed04dd51fe1393770603a6d3.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi.hip -> fmha_ck_autogen_a5c0109313de1f6245d2a80f8539485b849e9d55.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_deterministic.hip -> fmha_ck_autogen_e73a776ae4ba68c23acab1a5a6381684051738ab.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_a225c4f1f3c7b271957768bb9235131c67afb48a.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_c9530e20038eb40c49bc8b045be0cf4e7e6b4eac.hip 
-fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask.hip -> fmha_ck_autogen_f51f1a11f778d99a00aa5959a3e58a41fcbfb1e3.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_de36bc309877917a18fd21acb30563c7e2f233c1.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_4b45948f2795293e72530b02669c4f549608ea7f.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_05f794c7023cbb7e35f1fd1ae45bd2377bfbc520.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_deterministic.hip -> fmha_ck_autogen_82f1d7e1a93bf2fa80c409e6827ea88af56c44f0.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_dropout_wg16.hip -> fmha_ck_autogen_4baf664bfdf070362bcc91af77d1bc406f744351.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_80efc341089a50ed5669b3c86f6ddd9b124d1442.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask.hip -> fmha_ck_autogen_e465193d97d43237c22c04478ca5833011d8dc8b.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_deterministic.hip -> fmha_ck_autogen_915b75db795dbef037b14b003ee073665fe35d3e.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_dropout_wg16.hip -> fmha_ck_autogen_fb5bb49928ce5515d7b297d5eadd4ec70a22d60b.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_349241529745bf138552f49d9a93db418663ad65.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv.hip -> fmha_ck_autogen_c4de1bc135191f3c2aff740f4c6bb7e98da42f84.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi.hip -> fmha_ck_autogen_4ce03571f1d2779bdeaf0a6a2d617e236d191c11.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_deterministic.hip -> fmha_ck_autogen_ea077e68dbc1bed2dd20a5f4dd35e0cad6330ee4.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_c56aa150611b0d4800470c1493dc907082a5c23f.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_7e9519dd0d0f940fd5efd61bd32df7528ba7e3fc.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask.hip -> fmha_ck_autogen_47548aa042c69bb9c59a8bf706b44028aaa41830.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_deterministic.hip -> 
fmha_ck_autogen_3dff884e176ec7cff86d17c6afe1ddaa4dd6007d.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_2081430c92864c29bb9f409e7c27caee1de00749.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_3d1cea88a2277b87d405025ba256272a1720f88d.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_deterministic.hip -> fmha_ck_autogen_a55c7dd576e5b1061c059e5e99aeedf4389e2d25.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_dropout_wg16.hip -> fmha_ck_autogen_8c074afcf33e3f3534ac3577484237fcfd2ca48e.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_e618fb4e529104fc90069c8779ce5463460bd516.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask.hip -> fmha_ck_autogen_44462715ed5f192532760d6f4c66ff9d4e20e254.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_deterministic.hip -> fmha_ck_autogen_e1d85ad2c9d197f501267fe0804e6985802fbd18.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_dropout_wg16.hip -> fmha_ck_autogen_78663faeb0425f45e8a0da0f7b1a5ddbee5e07e7.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_15e8e1ab8c63db96843054bb7a98d708ae6a9c44.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv.hip -> fmha_ck_autogen_629e0b97b3fece7c12504f4c8f1860d611b57269.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi.hip -> fmha_ck_autogen_29c9e5384809b21f39e78bb2e43af345a9a21d19.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_deterministic.hip -> fmha_ck_autogen_14f77aeeafe4b28f314fde5ebccfd2a554872781.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_09d76cca48b71dbcc9bd96734787209fee4c9a74.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_a4980becb0d3149fee575bad1fc3b463d08aabf5.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask.hip -> fmha_ck_autogen_55bf8444c1c26b91fd490c7216f4d0f8aa0a1f1a.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_e4d9a2396ceccdadab24602f30e9070901a76dc7.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_14fea611f3c253aebf726af3e5fdb7e63e18e13a.hip 
-fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_587fc33d02b1932235b8d152e57559060211d591.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_deterministic.hip -> fmha_ck_autogen_680e81c3700f130df142c9a37a368944ca548721.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_dropout_wg16.hip -> fmha_ck_autogen_62048a8ae1c0096f3372b0114c15edbe813425fd.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_9b4dcde1ae3446b825dea739d4295c1d1ec5c4be.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask.hip -> fmha_ck_autogen_ede81dbc4cb208ef6e684c76ba1eb451d37fe10c.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_deterministic.hip -> fmha_ck_autogen_59901147b7188212b8d8feea15831a11425fe4b3.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_c9ad71883a19b522486706d3705700c012a6fc19.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_ad82071cc074fd30437f6158b5eb2c6df1f8c587.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv.hip -> fmha_ck_autogen_bd3daa5f99b4522d932334924347353ce2854821.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi.hip -> fmha_ck_autogen_b72a804bb3c99830653d41ac0bd49943c801b89a.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_deterministic.hip -> fmha_ck_autogen_572e68bd619e118292768f0925ccf92cbfa68415.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_ee1a43f2210a8d1e5623411c95c33424cee5e747.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_a93a03b33305b33055273711ab31a5b8d8298d5d.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_c3cfaf0d53869c373f6d0ec821b008dbb819141a.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_b2af5f5b5ee3ae964824a3e9c7bbeb5bb39c557c.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_56964a17f902257aca9d08c736516a2c67d9a0e9.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_f9824fb32933b27501ae8a7f43f460a2dda6a814.hip 
-fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_deterministic.hip -> fmha_ck_autogen_4118e3ab290263ed2576feaf22a1944bf2ddcb7a.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16.hip -> fmha_ck_autogen_ce5ad502dd40353312d561e9f40aa478c16ef5b1.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_6d07bf9c05e41dcf2416e05dab4bdde17158db76.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask.hip -> fmha_ck_autogen_b1c5d55d47d6038e9162d32ac968ff58c0942938.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_deterministic.hip -> fmha_ck_autogen_30c8e4d5c761fda50e010da779e8e4730051d403.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_193699a5daa14ca2def07489e0b563149bc403f8.hip -fmha_bwd_d32_fp16_batch_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_c0342686e4efd26413c6719782ed13603479c4e0.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk.hip -> fmha_ck_autogen_fb79e1f9231692d736dbada062ed6821f34927bf.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi.hip -> fmha_ck_autogen_5f3c3bed2b584ea2031debf9f953f5f8f7012171.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_deterministic.hip -> fmha_ck_autogen_319df310195191895005b30151da8c1afab6c82f.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16.hip -> fmha_ck_autogen_6af23d1460abfe875e71f7911697c42fef0f41c5.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_cde0582e1aef74f9209de638b553ec0671476258.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask.hip -> fmha_ck_autogen_5052b2318dbb78b1a82ef03666a35a623f44481b.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_deterministic.hip -> fmha_ck_autogen_2543da478310245e19e6c6a0d9ed7ad99540b3bc.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_20f7ea0aabd069362ba4bbd66623cea5b6e1a6bd.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_da74887afedbd67928fe4d596709f9ff92530611.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_deterministic.hip -> fmha_ck_autogen_155c3549d067464d186a99b8205317cc000d4898.hip 
-fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16.hip -> fmha_ck_autogen_089a347aef8a920e3b59d5ffe71fc5bfe002609c.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_b0dd965d5d9080ed5c6a04b7eea9890f3a264f20.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask.hip -> fmha_ck_autogen_2db33b5442d2e0948762b1f2147a321a9d6907be.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_deterministic.hip -> fmha_ck_autogen_1cc459e57bfed5ec7f40ea4a4dd9f72f3ad7a709.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16.hip -> fmha_ck_autogen_02ff94e3c787a7b06ffc90c25777fa74f225e32c.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_86309c036d96367939ccc3e8922595ac35a3e179.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv.hip -> fmha_ck_autogen_8b92990df507e82f96eeb7aa3ec00c01437566fb.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi.hip -> fmha_ck_autogen_26835ba70606c769e56d19dbfe74061361aa855e.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_deterministic.hip -> fmha_ck_autogen_dc1a7f9b1afeba6690fdc0d0d1755ea89c805573.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_815918206483d2ae04a45aa67d69dfb986587214.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_e1c1a31a1d8556cbe0b6ea76faacc78855108539.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_54b6e18b10d529eb6b32d7c19c59eaefc7184376.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_a622fa57764ec746e02f6d4bd4846b48c722b807.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_3b5b3c218e4a7b459e54080e24c5b730221eac02.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_85fdde4b25e2fc8cbdd46c2850c19eac8d9af8f6.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_deterministic.hip -> fmha_ck_autogen_a4b7f10440331a8a88ff93ba253217c2832bcf9e.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16.hip -> fmha_ck_autogen_4e9a933b916285d9580a76df543cfafc88a536cb.hip 
-fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_fe8b8c3525fe86a20a2d6c69585f3e36c16caabd.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask.hip -> fmha_ck_autogen_7d12e9cb599d24631c082e3cf65d2c58b6d4d44f.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_deterministic.hip -> fmha_ck_autogen_8e812705ae3e452810794fa7caceef2ef6066dfb.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_7cdc419d4248dfdeeab1f0980aec35fa134e52e0.hip -fmha_bwd_d32_fp16_group_b32x128x32x32x32x32x64x32x32_r1x4x1_r4x1x1_r2x2x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_a046e888e3836b0bd3c49fec8e1872e880798f0c.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp.hip -> fmha_ck_autogen_8278845045d68027dcf3bf867ecde2fb12ec51d3.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi.hip -> fmha_ck_autogen_18a4d71b31c451a50df7996e3db864bc3c3882ed.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_deterministic.hip -> fmha_ck_autogen_5c36fc744dfb0d985c9113175e76c7ec1c935054.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_dropout_wg16.hip -> fmha_ck_autogen_b779cc0b0380e1e6a2b51fc6216fdd72215b882b.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_19af6a7f9e5020e8d0f0ca0f6258001f6ce592c1.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask.hip -> fmha_ck_autogen_459ea3713aef9b916e1b38a882a45012930924d3.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_deterministic.hip -> fmha_ck_autogen_977137b371df841993c8d0584be7d83aca6add78.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_7497eca4d1a18306b406b367653622a8d64095bf.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_9bf235679af1ca03a6e601b4cf6cd0416d1c9091.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_deterministic.hip -> fmha_ck_autogen_7177f939ac3dae8749cbf4232dcf04d2cf63b48f.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_dropout_wg16.hip -> fmha_ck_autogen_1847fef2c06ea581b0ab31af1cb0556c572696ad.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_dropout_wg16_deterministic.hip -> fmha_ck_autogen_b5bccc85f74f54a2ceb17fe3040b04fe306c53f9.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask.hip -> 
fmha_ck_autogen_f7aa9c39b06e55bf4bc9f9a2a0fb075c9d4e69ce.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_deterministic.hip -> fmha_ck_autogen_a78fecb9725ceb4bcf2aa037d43bc43efeb1c3fd.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_dropout_wg16.hip -> fmha_ck_autogen_f93bf815b520a9d9e17b43bf9d7fb870751b6225.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_b24f91dec2029b25d0d96962528410df55a468ed.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps.hip -> fmha_ck_autogen_00a2adbe938d458d51ca5fc4020667a215b672a4.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi.hip -> fmha_ck_autogen_036887daf6cc092e7422a17882488e59cecfb643.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_deterministic.hip -> fmha_ck_autogen_1a96f0ac76f117e66eba97cb990c2350561ec2ab.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_dropout_wg16.hip -> fmha_ck_autogen_0c3b2ec99fa7b09c7f78dcc3142a661d686044ac.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_4e760de14b71a41882ec4a2c7362565af36d1a5d.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask.hip -> fmha_ck_autogen_94aa519eb57e5797125728492d9330f5c0f0670a.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_deterministic.hip -> fmha_ck_autogen_6bad2ed9f91bc1efd89ea66cd5c775fa140cf931.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_9b73c92a13757877f34bd8a13c6fb29b60999020.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_1dc6e599144a093203fd7f92ac6d3c2cd7180d49.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_deterministic.hip -> fmha_ck_autogen_7e6129eead18d13a4a6cb9550384fddabc7a2a16.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_dropout_wg16.hip -> fmha_ck_autogen_04f39b453505f68a5091f68b1c3de48369d1e7ea.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_dropout_wg16_deterministic.hip -> fmha_ck_autogen_c5b440ca9a5196ee1e72c878c87d96934e9273c8.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask.hip -> fmha_ck_autogen_cb4576e8ea5d59d7663f3760009a00a19e1b0667.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_deterministic.hip -> fmha_ck_autogen_44690e48f30657b0fcfa26fb3b9af3ef76e792e3.hip 
-fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_dropout_wg16.hip -> fmha_ck_autogen_b872f9e6ebe330cc1818ea82b53acec79a2f672c.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_0fcb7492feb79e27e0bda73e57ef7dab410e2bb6.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk.hip -> fmha_ck_autogen_7a242e5953f44316b6a4f6587ec26283ed6cbcae.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi.hip -> fmha_ck_autogen_2184fba2eec5899bb40d49d4508196e6be1ec1b1.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_deterministic.hip -> fmha_ck_autogen_06b74acd9abfbd1c4ec2f4c718eeb92a0bca7bab.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_dropout_wg16.hip -> fmha_ck_autogen_ce5c161b725becf059fb4439c668edd454ac77d1.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_addb6a14043c5a4df0f5042b3770b40c4e90795c.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask.hip -> fmha_ck_autogen_7ddd621da88c57798db1e689b93b692b6519ff96.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_deterministic.hip -> fmha_ck_autogen_b0544a38dfdf4d81dc95894387845f48435e299a.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_11ff174ff2175e9ec22ac3a0fa59dd7713b79643.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_a3f9c236d24b30bc9c3fad90cfd6eb00da835de2.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_deterministic.hip -> fmha_ck_autogen_515128c6978449b33ce0c35b02a9e9aaad65ef7a.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_dropout_wg16.hip -> fmha_ck_autogen_0b3153af7bcdba33115a0d31f121fd76be2ffbcc.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_d3a2edf232786d458e2125f8dfeda8847f842afa.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask.hip -> fmha_ck_autogen_a7f7553a7d2f6d42fe695cdc64423c85223af440.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_deterministic.hip -> fmha_ck_autogen_a9b50c6ebb27986ce5b378d8c39315eb9cb91dea.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_dropout_wg16.hip -> fmha_ck_autogen_2f55a23a0f24ff7062a4c286944f25d2db3e20a4.hip 
-fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_0be8cf70c6be969ecfca675782c860b5b75ac089.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk.hip -> fmha_ck_autogen_1e9130607a2d24cb0662a47e9cf12c6602143838.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi.hip -> fmha_ck_autogen_cee81ab2e2678816c7b516d2d4c50e8cb5874c68.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_deterministic.hip -> fmha_ck_autogen_c5fef330a975002ed15670e8e7b26a10376d3cb7.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16.hip -> fmha_ck_autogen_0c9bd38b8f9009d932ec49204fdea39a52885246.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_82c932e6eaaf44861c794539d9caf8b50192fc44.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask.hip -> fmha_ck_autogen_4568af1b2f104664fd05d21ad789aed39ecfa42b.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_deterministic.hip -> fmha_ck_autogen_d9c23b7f8fcc4e4f4c81f5f00cfd345b98df2e0f.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_de7eb562a7eff31d589e12945d80233aac202ae2.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_a92b43d374642df991edef1f6036dc898bf77cf8.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_deterministic.hip -> fmha_ck_autogen_014c209d5cfc6b965bfd78c64bf132c0154e32be.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16.hip -> fmha_ck_autogen_1687ddf65ce4ed2997583e20fee9f201e86633b3.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_fc5841a729099340d608e31023acbeaeade3e886.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask.hip -> fmha_ck_autogen_9cc3ef3d3b36f52089548e9dce522b0448e2c26a.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_deterministic.hip -> fmha_ck_autogen_8efb5fc2ace6839eac741c5e6616665845f43566.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16.hip -> fmha_ck_autogen_ef5421703cbfa63a58ec02701e245d479a1fbfc1.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_b50e6df20a2426abd3d2ff2262a37c009196024c.hip 
-fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv.hip -> fmha_ck_autogen_a094599fb5caf5e7aba728cd4713a8d0c6368a46.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi.hip -> fmha_ck_autogen_21e235e31d6955393ac8e825bd69ead70687b7c8.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_deterministic.hip -> fmha_ck_autogen_289071756e7d0582eb61ce6483fa3c988d2e10b5.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_1899e28aff2fb168cdc3af7132dd7fd09c2e1ced.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_2e30f50071113dc4ab59468d568ac9deb06b0342.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask.hip -> fmha_ck_autogen_bdab172627718278a71a93e3737ef08ad9259a4f.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_77200e875e0ef160b311c7de450c137772312d0d.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_cb1b91c16e0255fe7a0a85638b98d94634e143a9.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_877e33463b3bf1853c6d2d2009af8d27bf88abbe.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_deterministic.hip -> fmha_ck_autogen_92e53359c69bbe4d7405d45261a8a62008eb7d06.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_dropout_wg16.hip -> fmha_ck_autogen_7764814a0de7702f0b7b5ce9dede6440603f4853.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_05dfe927fd64a564c5fad537fb7c41ee9c94c2c0.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask.hip -> fmha_ck_autogen_78f7e2a2c08cd87702793f91b6935cbe4c22be55.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_deterministic.hip -> fmha_ck_autogen_d4605b2ad3e3753c5f255678abc1690b949c5abc.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_dropout_wg16.hip -> fmha_ck_autogen_037c6c80fcec3eb8b0bef50ad6af6d27bf5447f5.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_fd9cd1305633b62b68fb8474ce021f639f8492e7.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv.hip -> fmha_ck_autogen_d2f4b869ff23874b6bde0aab68c419108b7e69f4.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi.hip -> 
fmha_ck_autogen_6ff58a5186d69efd6062f3717bd315394ea6592b.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_deterministic.hip -> fmha_ck_autogen_8021fa266c77e6b5bd1af2a9c22c686e5a6eac78.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_de5359f0fba3da9dfed06ddbea8fe2a33a9cf40c.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_fe72cdd69944d2d765478d4aed13066a02b76f6d.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask.hip -> fmha_ck_autogen_6a7b6781ffff9a42beebb4d73f0d15461ddd4479.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_28f7634d29bef11fd466b452a46b0612f38c949b.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_66f651d3415562206c1049b172261fddba01ea6c.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_32438250078ba2a47345ec4955dafb4e4de78a25.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_deterministic.hip -> fmha_ck_autogen_5ea53f7c6370845fa94aa9b395c52fd1900b62de.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_dropout_wg16.hip -> fmha_ck_autogen_d50ac8e8a03f8e7ec2c6e993dd39f09f465dab57.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_e2b629c37cf94134693ce455b8c88b72a39df7fe.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask.hip -> fmha_ck_autogen_157b89d8d625b8244b5cceaa4d3e5fc5a09c8989.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_deterministic.hip -> fmha_ck_autogen_5789f267d34c9961ced63ad07ffea2c6d2911415.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_dropout_wg16.hip -> fmha_ck_autogen_38010c9bf7341588f071f889b7a0b4dcc4e7a14c.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_3d55cb42b0096a8ae338ce100f86e378aa1a04c9.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv.hip -> fmha_ck_autogen_11e7df31541c3aa919e9825ad7dc4432f9a03c0c.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi.hip -> fmha_ck_autogen_d7145383e39dec0e346b5094401acf85ef3c2075.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_deterministic.hip -> fmha_ck_autogen_04c363e11d202c6d2f4bb753661c5a2043edc0ad.hip 
-fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_8fb33fc20f2e85e915f1b1529ae87981dfcaf86d.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_97851d5ecbf02f8af623988b1a39c0b91e51533a.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask.hip -> fmha_ck_autogen_9163ae070075f26926a86d39e15c27e6edb1f1cf.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_9ab73ea77ec20ea3bfaf995dacf93a6960ecdca0.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_21828c7d3f5574690f12f841c27f025206e6165b.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_dc08afbff5def8bcb4e823657ce01f57c9dc77c9.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_deterministic.hip -> fmha_ck_autogen_875b08ca602fe48840c72cd61798acb98540fcd6.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_dropout_wg16.hip -> fmha_ck_autogen_216806a4598c885e517e664fc8280c59ec3cbf11.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_f90410c26d7649e21e2ae5e32e7af89d84d2ea70.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask.hip -> fmha_ck_autogen_a3339150d8bf9d073827738527f6cbe15b854607.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_deterministic.hip -> fmha_ck_autogen_7a0ab620e6d62259a559e329460e46e6e3f7c3f9.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_7a2e032f6500fbc5468183415b6dd1d3e43f0bee.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_71a2d046629a4b65c90d0e18d061c4984062f844.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv.hip -> fmha_ck_autogen_04ffca078cfab8bc6c4ccd1cc8994a1bb4a88ea7.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi.hip -> fmha_ck_autogen_836a308c2d2afd6e0dfbfda61984b631c4ccffc6.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_deterministic.hip -> fmha_ck_autogen_62ba7a5a0f3a714eb5f9f2af20f7bfbc82a30350.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_e9b04e6d5527ba0b8089ba8bdd264e2d5759338b.hip 
-fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_ce5b5932f6df9a194ceb0d69220fba9596528eec.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_cb3d5273945c5d40cc05c2660af2df1fb7a15f3c.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_5ace1c9b00f160a17355d4583d49c47887ac33c8.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_054fda16133a0d25077967b05425f9128e1fe1a5.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_7adf69b51f0a8cc9ae7e250e60df38758230fe4f.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_deterministic.hip -> fmha_ck_autogen_cd757a8bbeabd16a44d149ab188430f6d79ddcaf.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16.hip -> fmha_ck_autogen_a5fa94bb32a80e81886b711ebfcf2df5f5405866.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_a02f152e9184af0b3d77082d8bdf519dbbfceb2d.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask.hip -> fmha_ck_autogen_cf73e1fc0015094861ca0c1c81bacdbe0c5b8f37.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_deterministic.hip -> fmha_ck_autogen_a9df9ac4ee78e5f4d5bd0567e58a7090907c61e1.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_92121fd448b4640a17e1a7fe73bb7b58714c0afb.hip -fmha_bwd_d64_bf16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_2c9756060ac0e73dbcfc58a9222a78f0283cd029.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk.hip -> fmha_ck_autogen_354121d3bad1d448bd413718fa096f54faa12e95.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi.hip -> fmha_ck_autogen_d4c9f975891087e6eed6393629b41155deafc509.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_deterministic.hip -> fmha_ck_autogen_9bcc791049e3ff9ebc1a9085d2d20efcc2f99b71.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16.hip -> fmha_ck_autogen_79d0b8053ddf99a4d4447656d733c2da026b3a7c.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_6f8788c537cbf6833c58a6ca15c0a36de33c9fbd.hip 
-fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask.hip -> fmha_ck_autogen_5fa19223cf296d7fd10e15e2571e63c84a80fbb1.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_deterministic.hip -> fmha_ck_autogen_4dde56efe17f4fd36a11cc959320a5e43f1dc232.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_fabdc143c29d5ca50ab1e96a814bda6d05b0d5d2.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_c977735a36c325706bd19a12df66ed0839b032b1.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_deterministic.hip -> fmha_ck_autogen_7872c45ba170f2782c4b5b75cfc78ac79a4cf157.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16.hip -> fmha_ck_autogen_7c4710e8f4e27fae4ae079f1667c3a1879cb6da8.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_09e50367b62bb09071e28b44235a7c112645a706.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask.hip -> fmha_ck_autogen_1be43f8b629e7039f57b95866d5777273377470d.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_deterministic.hip -> fmha_ck_autogen_d0de618ff3ea9f67b90f2227fb7fcc74ea34183d.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16.hip -> fmha_ck_autogen_865eb90b1a2d64acc0f6fbe1d807c501fd4be3cd.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_bb35c86443cc9ea38c06ebc0656306483c95ef67.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv.hip -> fmha_ck_autogen_ec171210efd217c07d357fcf42e5372ad7e9abab.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi.hip -> fmha_ck_autogen_bd80a1774d8b7d8bee4e8663392b97cda11dcbf5.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_deterministic.hip -> fmha_ck_autogen_b19f05f6848403480ba41d37cdbf44ccca1b1f8d.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_e639a1e84faa98477b05df71d363b9ff0f9b2760.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_a388a284f45f711d82a6ed87036d87cef1872eb1.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_26ea90eb5a527434c1740933a1d2dd863eccf14c.hip 
-fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_e16edb824cecf459a8ec51b8dc74b1e06369aceb.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_843e7888cba5f463d19fcb71aaaab25dc3d2c09d.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_4f6243c6850c0a2d2b7bf1476e12f95f187257b6.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_deterministic.hip -> fmha_ck_autogen_89617bdea526d12d6a33ed42b9b0018c0b173722.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16.hip -> fmha_ck_autogen_2b4050988e5790a28dbe10b4c20e14f10f6cf85c.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_44cc95831c347212021c0bab7b43acd7daabce42.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask.hip -> fmha_ck_autogen_ece60111633db08f765b3c7cd5cd768cbd030255.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_deterministic.hip -> fmha_ck_autogen_79a7dce707954e765d97cb22e57d9bd6168860d9.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_761bde840c0c8149b24a8f6f264e963c4e9e8ceb.hip -fmha_bwd_d64_bf16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_609616f72bf16a060fa50091ac139ddc06bf9d88.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp.hip -> fmha_ck_autogen_ca1992a2634cd6674076611be54197c715ad8271.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi.hip -> fmha_ck_autogen_2f0247e301a7b076b6ec8a778c3b47e330638963.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_deterministic.hip -> fmha_ck_autogen_55b14cf2998a61611d1de2594e926fcdc378999c.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_dropout_wg16.hip -> fmha_ck_autogen_21411df58165946bf02942b597d94de7dd856987.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_b3063d06723ac70c5f8802ab49c5c35e1debf56e.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask.hip -> fmha_ck_autogen_4052ca6a3ec02f6559e4bbf1edde42ad2d127c26.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_deterministic.hip -> fmha_ck_autogen_d41cd6b60a97e7071518cbd1a63abb8b910df024.hip 
-fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_e75d492ac3a6ab75648056bcf26250a4aa929cfd.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_474fe2d739eca8c93fdcb2c105d4154cee6ca1c1.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_deterministic.hip -> fmha_ck_autogen_2c0bda0feaade2b554d648d72f219ac9c389bf09.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_dropout_wg16.hip -> fmha_ck_autogen_2122c973581930ab7a4ebc90b3bf1cdaa229a87f.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_dropout_wg16_deterministic.hip -> fmha_ck_autogen_a20c91b2f11bb7e5058ca7935b0bda4f5558a9dc.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask.hip -> fmha_ck_autogen_9990e6ad243a48b84304b5cad0c663c0802aedfd.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_deterministic.hip -> fmha_ck_autogen_7264e378e1ea1d4dd97f6949d66f3492883b663e.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_dropout_wg16.hip -> fmha_ck_autogen_7878e2a4d3b96a552e03d1ffc33debfd50c9f7f1.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_fc1eb85a00017efdc610e4259d2abe935b85304f.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps.hip -> fmha_ck_autogen_cbf3e4d4d4837a0cb33b78c4f2767b1d93da0850.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi.hip -> fmha_ck_autogen_5f8925f929a5b26f3544ca31938aa75b3c59d34d.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_deterministic.hip -> fmha_ck_autogen_8004763f674dfb3f14b66dfdeb2a046e413ce2cb.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_dropout_wg16.hip -> fmha_ck_autogen_0878b9aa31429d23a93cd953cc6a2fc5f43d0d3a.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_b5ba2e73df35f6e0f7317303823fde92a42b1a35.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask.hip -> fmha_ck_autogen_d34fcb56caa8f80404789fba0ffac447483a4d84.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_deterministic.hip -> fmha_ck_autogen_cb1a0ce432c27f4cfa51731c3ef181bf60c8a727.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_efb9e7d9af47cdf79f15f674f8976c05f08b0ce8.hip 
-fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_357f7e626135cc9176a295f3d1f336a7c3852688.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_deterministic.hip -> fmha_ck_autogen_22c142d869ef940ca876c93033ad53b576ed34f2.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_dropout_wg16.hip -> fmha_ck_autogen_1621507cf219fe608715d4e5bb6e5764022e2d61.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_dropout_wg16_deterministic.hip -> fmha_ck_autogen_a25e2aed617e1ff31f93ae7e054313ee0dceee97.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask.hip -> fmha_ck_autogen_7ec038393ec329a894aee9bbac078a40f57a4684.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_deterministic.hip -> fmha_ck_autogen_15dc02ea7e0908cf0bd48034f5a49debfaa36219.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_dropout_wg16.hip -> fmha_ck_autogen_758b211174da0f398b2a093e7389905b4f9c4060.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_ps_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_548b347672451e8391388a400d016803f4c4cf8d.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk.hip -> fmha_ck_autogen_ae7899b1ef159ecbf01f27014601eb79b31b49b3.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi.hip -> fmha_ck_autogen_b04f14f829eff73afaa57a875f74ebd1e6860979.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_deterministic.hip -> fmha_ck_autogen_2ad492377add5c8f6d0d2dbf9ee9e4338bbd9f1f.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_dropout_wg16.hip -> fmha_ck_autogen_7f6ccdb3c2d595fffd05bc5e6417b157276547fb.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_69cbe8eca7e3510f5caa7f13419cfbefbf031754.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask.hip -> fmha_ck_autogen_8bd7b8c63a51c8639b3cf27ad09d41ae47c480d3.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_deterministic.hip -> fmha_ck_autogen_f21596e8c608a795ff971aea8e199db9e72b65d7.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_1da23de9604b5d98fe02529075bad995954c12ca.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_49d4c005d723cdab9fbc307933c1257d114b539e.hip 
-fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_deterministic.hip -> fmha_ck_autogen_e2c9f955f227430c6224ebc347649386be7f01eb.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_dropout_wg16.hip -> fmha_ck_autogen_290c484c2a366258941ee0051e139ea716a9de2f.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_84cca7528c7d1bf49ba79625733ff0ae7522c096.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask.hip -> fmha_ck_autogen_f3d0166931e4406873d8f552a5d5b61fde2391a3.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_deterministic.hip -> fmha_ck_autogen_8046f566fa7188c92568b277354e8b06ad382544.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_dropout_wg16.hip -> fmha_ck_autogen_12d60c8abecb3bc9b84b0ea7851628ab17d8b0b3.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_psk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_f50fa4ea674a590d0a817367ad9915a5fce20c51.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk.hip -> fmha_ck_autogen_0836d5dfc0f939ab9a4064b403339373caf35b56.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi.hip -> fmha_ck_autogen_de6683d175affaa5ff261ab8503f64172d8eba8b.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_deterministic.hip -> fmha_ck_autogen_beb9afccc15de7dfcb2e7d898abc0d61201de73e.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16.hip -> fmha_ck_autogen_e6e0ec1db1ea308e226f675e68e29b839e41b252.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_7c3d8ef4da515960bf40eb1feb04d21950ad5ae5.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask.hip -> fmha_ck_autogen_fcbe827108d252b2f5847fa8e132c9c3e56a90a0.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_deterministic.hip -> fmha_ck_autogen_7993fc08ac5c6ce7a2eceb1227f4e3718dc4cf5f.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_06ae52ef937cc27c544e32025ea0dadb7fad982d.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_876a418fbe6183d0392b7a7d9986d067e323e2b9.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_deterministic.hip -> fmha_ck_autogen_b03ab68e33844f97aa58d463e00037bc11c50da0.hip 
-fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16.hip -> fmha_ck_autogen_8c7970957024de050748d3e31cef434f582d968b.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_add29e3e9828911a117dccaa5650e77805730d14.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask.hip -> fmha_ck_autogen_0e007c36231ccdae12f102eacca1f74b0711b9c6.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_deterministic.hip -> fmha_ck_autogen_765940baaaa2ae6ade43ef4c94a220eaa63702b0.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16.hip -> fmha_ck_autogen_c7af2bbfac25de2853be344b9f636226c1c0112d.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_b2f91e937b427ecc932c0cb0c90b2c2378db0be6.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv.hip -> fmha_ck_autogen_8da8285bd6182355e3164cdc5a983375cdf0a61d.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi.hip -> fmha_ck_autogen_a3ff8445ba691807caadd9f26e7eb90851875280.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_deterministic.hip -> fmha_ck_autogen_9c4fc7cda4b560040cec93f63021b529aa1ee3fd.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_4018b1fcee808b6cccd131418b6ae9e8bf900d8f.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_88d52c5f70abb525b9c8aa8fc1cb3997c33ed67c.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask.hip -> fmha_ck_autogen_99e2f290b962f1617b0a9d4fd6d55c43e4439d6f.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_25938733446b6c0dcd159719f08d04a9aa467967.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_76f884e9ca116ee47b446efe9fc770c178a858d5.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_42e2326066c91452335eac05f25a6311376bd9e5.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_deterministic.hip -> fmha_ck_autogen_24643917fc970c043d1c80d8d4b17ec92deeb8a1.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_dropout_wg16.hip -> fmha_ck_autogen_d937609afa8e21a761dad6b01ff3f26346e450fc.hip 
-fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_59beb9cb4e161f9dcff79080149076488d436301.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask.hip -> fmha_ck_autogen_fd3558b4c7a667dbc365c4c2ceda646975408f51.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_deterministic.hip -> fmha_ck_autogen_dda8d021381083bc48b7fb1840729254dd8e5137.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_dropout_wg16.hip -> fmha_ck_autogen_ed37ba962e0288e2840eb0925d016b5a7e3b3164.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_5467aea26852aa9a9e3dae76b906005ddf6fbae1.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv.hip -> fmha_ck_autogen_76be322fc072ca19baa82707e260c6eba936ae19.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi.hip -> fmha_ck_autogen_c921a4790f982d48bcaf950123c699647afb739b.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_deterministic.hip -> fmha_ck_autogen_76674fc182dfa6329c73a354aa3adf458429444a.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_54402a22ceee3b665a3f24edb98b8398c35c6f5a.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_ada016be2bd0e377fbe01fa7adb9bbb8febce100.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask.hip -> fmha_ck_autogen_6db86621d626722434f2ae9b7b8ab435a8dd8827.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_162b0dfbe3f615b1d164290799b2457437a0044b.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_628b28f65f19e7d1b22fb3b85b7cf3d09cd54ebc.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_031b12f9fd94e01aaff2c0da4f35f346822087e4.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_deterministic.hip -> fmha_ck_autogen_b9a742ceeb6736a2c8f9439d0b05e10d3e0c5c6f.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_dropout_wg16.hip -> fmha_ck_autogen_afccf699f593c828e11efc053b144044e45b32d6.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_fba36678d5047ded97ee7a7ba9feb9569afdb6ea.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask.hip -> 
fmha_ck_autogen_14baaaf1e90a075ab802c6e7d97c4b1605c8bd72.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_deterministic.hip -> fmha_ck_autogen_0237c76137df14fb808ade8bd6837045f2aaa5c9.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_dropout_wg16.hip -> fmha_ck_autogen_c2a2856bf9a81544a30d535a13554e3a8107c476.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_c2940fd05efd52bdf8a3f9aa4b78bde9b5809b34.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv.hip -> fmha_ck_autogen_d049a1b8f4c1c6d37973ce38593efda1de8ce0cd.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi.hip -> fmha_ck_autogen_f4b87f983a5e84582efa1663f84da76cf60b5f6f.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_deterministic.hip -> fmha_ck_autogen_4db2e63cfebcf84043f79be0321708cd159c62b9.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_f25b87c435bc5d7d85d738f3fdf68947d79f5a77.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_540bd57333c6839ccf5cf2e928edb996bc60c371.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask.hip -> fmha_ck_autogen_9583148fd684a7e6a312127e023798278415bd27.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_bf9cdf86a7944cd690b0fcbbaec235863acd10bb.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_2da2b905c4ce32234c2af62328adae6b1f9217a8.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_c4015f0d0a7a5173810f6f17c00065e03fc61a89.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_deterministic.hip -> fmha_ck_autogen_d773df9ccfc1ace90fe3afb5c00976deabedf6f8.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_dropout_wg16.hip -> fmha_ck_autogen_d137b7b6e04e1caf43a62bd6788a75361cfa98f6.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_adaef10ff2c5d89530310bdf1d53a194f06a94ef.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask.hip -> fmha_ck_autogen_1be746990a2032f0363ad9f9112cc994983f4706.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_deterministic.hip -> fmha_ck_autogen_55bd9c4f1b7a0621c67f3e964d946ce22fb2fc80.hip 
-fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_4dc87b7d385e7b092e4706c464217b004fd8a6a4.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_pskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_91695dea4171747fb3cc6d910459f800608d07c1.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv.hip -> fmha_ck_autogen_c137c03bf161b2ec6a9a046fa49d7bbf80ae47b8.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi.hip -> fmha_ck_autogen_83080406598df6bd3102db70a554e496e29db96a.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_deterministic.hip -> fmha_ck_autogen_03a71615a088e972c998f9c7cb44566c268c5124.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_6214f820b39a8ba81e547a78ed19a909ac13221c.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_3e2557f206fd81d82a3b9d59113105040beb891f.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_461737a13e24009bf1a5a4b780175043a9f2e33e.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_83f6a1837a65df12b7c55d25ca28cc939c2a6328.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_c59a22c6efd8bb8815887325aa0b739e260cc754.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_6049c01db99fce654e9351e711b113cf7424550a.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_deterministic.hip -> fmha_ck_autogen_c9f28230817c9d9805c41dfcd4e834fe302e1df1.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16.hip -> fmha_ck_autogen_7728d5bec7941c9b6d5632bee8d67ed92b9c03ec.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_28f1ef32c4384ec26f3dc5e3af6a74fc8cebae92.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask.hip -> fmha_ck_autogen_594929c433b049a8cf949ff476309a8faf5c25fb.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_deterministic.hip -> fmha_ck_autogen_8441910c34830ad2459fb85c2c14af02da718fdc.hip -fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_230861e81e5acc523fa680534eed757b7b4a4e1d.hip 
-fmha_bwd_d64_fp16_batch_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_c112c01d201c366bdd7acccf2e1b18b00f671153.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk.hip -> fmha_ck_autogen_6b638314efcc4f16aa4a6e58e6caf2fda1711519.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi.hip -> fmha_ck_autogen_c8f6461673882d636772ae4d26e78eabcb568f31.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_deterministic.hip -> fmha_ck_autogen_f93bc23b8a4f1e0fc5c5756c4e1c835bf59dea09.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16.hip -> fmha_ck_autogen_4356b3a2ff49f72b91a6b9c215df285f2798ad47.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_e1cc934ba7baab1a2eb062df1e4ee5066e9ffbc3.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask.hip -> fmha_ck_autogen_137fa6780d9e6bde10aec10a875c039fdbbc652e.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_deterministic.hip -> fmha_ck_autogen_06ba94794a14f0f0022af6f5f3c16e1e16959d4c.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_4b1eaca3c37a82d19f8dc91f06764170069ca3af.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_91c916e14198f6d18dc89915e379b01070434e91.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_deterministic.hip -> fmha_ck_autogen_8e816fcad5e9ecfca94a6491eb2274bcc41e558b.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16.hip -> fmha_ck_autogen_5fc66c5b53f83bf1e023e81e9d51f0285b3ae731.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_dropout_wg16_deterministic.hip -> fmha_ck_autogen_2d9c659ba43bb907fd4e3e36a50958288bafd1a3.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask.hip -> fmha_ck_autogen_07ff04fcc273e469737512893ea3fb5876ac131d.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_deterministic.hip -> fmha_ck_autogen_22632f996eb63fbe4bc5748c5897b775087446a0.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16.hip -> fmha_ck_autogen_f5f1797f6b672a55476348571ce17645c8a62869.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_iglp_pssk_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_eee408cf9456ff977aa7d12345e9b2f1e60639f1.hip 
-fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv.hip -> fmha_ck_autogen_303b7b04496e4db7c1ba2436485dc7c8a4c88448.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi.hip -> fmha_ck_autogen_fcb0b08e29b2e1bf181fceceb9dc416e54f52b00.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_deterministic.hip -> fmha_ck_autogen_d06ba4c996570ddab77b6ff1e2a0101b638543eb.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16.hip -> fmha_ck_autogen_fc5ebf0f2200f37ccc0849e0c3745f6e2f00111d.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_dropout_wg16_deterministic.hip -> fmha_ck_autogen_2caba3ab83239e474412fcf89fe0fbef97e51bf1.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_dc184767d723f4995791848cdc68bd948408204f.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_deterministic.hip -> fmha_ck_autogen_c53e295b68e807774ed31bb914e4bc59312a77d7.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16.hip -> fmha_ck_autogen_db0d0cf55d90b3f3c9eecada1db93c420f34b1ae.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_alibi_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_d1c25cfc437d8bd803860e39a45b2f3b9fa48393.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_deterministic.hip -> fmha_ck_autogen_01ca79005067e20e4eed5a72ff9187cde702cd1c.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16.hip -> fmha_ck_autogen_a5e5cae764142683b70d3344cf07dd1edb7d69e2.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_dropout_wg16_deterministic.hip -> fmha_ck_autogen_ca920c3239bb5796b1ab2fc75177eb3b820aa784.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask.hip -> fmha_ck_autogen_806f9ab9baf631df1d3a8d801e4cf93a102526cf.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_deterministic.hip -> fmha_ck_autogen_4b30f472f00bec9da0564ddc40e07112b5f9a117.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16.hip -> fmha_ck_autogen_dc039d422a57c159ea4dbcc867d766ff1b356a07.hip -fmha_bwd_d64_fp16_group_b32x128x64x32x64x32x32x64x64_r1x4x1_r4x1x1_r1x4x1_w16x16x32_w16x16x16_o1_kr_ktr_vr_psskddv_mask_dropout_wg16_deterministic.hip -> fmha_ck_autogen_5b55946ff3c15a44b9c741e9f6bbbcb5bd4c8577.hip -fmha_bwd_dot_do_o_d128_bf16_batch_o2.hip -> fmha_ck_autogen_658552954505a2092662071401e135e84956c4c0.hip -fmha_bwd_dot_do_o_d128_bf16_batch_o2_pdv.hip -> fmha_ck_autogen_53bd60bd2afee49b30a583c32a45ae9f2076db08.hip -fmha_bwd_dot_do_o_d128_bf16_batch_o2_ps.hip -> 
fmha_ck_autogen_8e675919a6c7758cbbeecb83b7ac6c62f95cdb46.hip -fmha_bwd_dot_do_o_d128_bf16_batch_o2_psdv.hip -> fmha_ck_autogen_2d06f77a4054ca615d96636c0e2eba2a89850142.hip -fmha_bwd_dot_do_o_d128_bf16_group_o2_ps.hip -> fmha_ck_autogen_187963e1969301abfa61d06afc97faea2bb4efb1.hip -fmha_bwd_dot_do_o_d128_bf16_group_o2_psdv.hip -> fmha_ck_autogen_e7153f9a9b0b7c54ddf2debbe297efcffbb4fcfa.hip -fmha_bwd_dot_do_o_d128_fp16_batch_o2.hip -> fmha_ck_autogen_3c3b7e4b8c1efe59f79a15512716fce2282a79a7.hip -fmha_bwd_dot_do_o_d128_fp16_batch_o2_pdv.hip -> fmha_ck_autogen_19cd9f7b08cec83736605af63d9fcaf463a1aea4.hip -fmha_bwd_dot_do_o_d128_fp16_batch_o2_ps.hip -> fmha_ck_autogen_b4588379eaa268d79fe8f8e4457b009f204a5fb7.hip -fmha_bwd_dot_do_o_d128_fp16_batch_o2_psdv.hip -> fmha_ck_autogen_23c9b46da8774462de8c24e14b12df3ed596eb57.hip -fmha_bwd_dot_do_o_d128_fp16_group_o2_ps.hip -> fmha_ck_autogen_5b413bdc825ae863d53dab548f2145dc0de8fd37.hip -fmha_bwd_dot_do_o_d128_fp16_group_o2_psdv.hip -> fmha_ck_autogen_58a7ab44bbd9fbc97c7805860d5f6ac81d6ae468.hip -fmha_bwd_dot_do_o_d256_bf16_batch_o2.hip -> fmha_ck_autogen_50f887556a3540609649744957651ca667b91774.hip -fmha_bwd_dot_do_o_d256_bf16_batch_o2_pdv.hip -> fmha_ck_autogen_eac5952f46f4f2bf06257b00661774eeed48a323.hip -fmha_bwd_dot_do_o_d256_bf16_batch_o2_ps.hip -> fmha_ck_autogen_efaa0cb33c71cb8ca7b83dd0e7a6c7b01f6b50a9.hip -fmha_bwd_dot_do_o_d256_bf16_batch_o2_psdv.hip -> fmha_ck_autogen_71e5fb3544dafa9da03fd2de4bb9bd0718f6009f.hip -fmha_bwd_dot_do_o_d256_bf16_group_o2_ps.hip -> fmha_ck_autogen_3fad30ff0739ab5dede67a96e859f8c474c245f8.hip -fmha_bwd_dot_do_o_d256_bf16_group_o2_psdv.hip -> fmha_ck_autogen_4bef4d120e71bfcfe61d67aa44d24ceb907c2b9e.hip -fmha_bwd_dot_do_o_d256_fp16_batch_o2.hip -> fmha_ck_autogen_7d0f767c17385eb7d756cbe8ed444d7cef72dea5.hip -fmha_bwd_dot_do_o_d256_fp16_batch_o2_pdv.hip -> fmha_ck_autogen_4b68e4d00295b294320b94bc777d7d34609127e0.hip -fmha_bwd_dot_do_o_d256_fp16_batch_o2_ps.hip -> fmha_ck_autogen_33746071156e9ad46f403a539dc237e0a44122a7.hip -fmha_bwd_dot_do_o_d256_fp16_batch_o2_psdv.hip -> fmha_ck_autogen_3d45624dc6e33c477c73a155500b015b6c010de8.hip -fmha_bwd_dot_do_o_d256_fp16_group_o2_ps.hip -> fmha_ck_autogen_8250f27341241086515d833aa53ae873d4ece3fa.hip -fmha_bwd_dot_do_o_d256_fp16_group_o2_psdv.hip -> fmha_ck_autogen_8793dc3217e154b65ebba065aa10ab4dc2374ae8.hip -fmha_bwd_dot_do_o_d32_bf16_batch_o2.hip -> fmha_ck_autogen_1a11dd5ebb989503a1c182684e7f247e2f8cd9c2.hip -fmha_bwd_dot_do_o_d32_bf16_batch_o2_pdv.hip -> fmha_ck_autogen_e16075c3a5fcfe63ba12e854bb1fed6873f014ab.hip -fmha_bwd_dot_do_o_d32_bf16_batch_o2_ps.hip -> fmha_ck_autogen_937801fbb43fb6797f0425f08d13926b74d87c4a.hip -fmha_bwd_dot_do_o_d32_bf16_batch_o2_psdv.hip -> fmha_ck_autogen_fecffa403b3631b1957e1a9a06f18fdb3b4eee5f.hip -fmha_bwd_dot_do_o_d32_bf16_group_o2_ps.hip -> fmha_ck_autogen_5ba578c0e7abf1127dd0370f06d7278656c93ab9.hip -fmha_bwd_dot_do_o_d32_bf16_group_o2_psdv.hip -> fmha_ck_autogen_345a939a2491166dc520e9a2b9de7e43671e0c2b.hip -fmha_bwd_dot_do_o_d32_fp16_batch_o2.hip -> fmha_ck_autogen_7393267865f1c2b0aa1a09a586f54cec98eea4ae.hip -fmha_bwd_dot_do_o_d32_fp16_batch_o2_pdv.hip -> fmha_ck_autogen_93b885d6869400b0dc2ef1b2c2636ddfd21cde31.hip -fmha_bwd_dot_do_o_d32_fp16_batch_o2_ps.hip -> fmha_ck_autogen_38f8a89468cf9c8606cf12a930db062a83cd0ea0.hip -fmha_bwd_dot_do_o_d32_fp16_batch_o2_psdv.hip -> fmha_ck_autogen_f974b12e83e214c30995a25631d37df1478927af.hip -fmha_bwd_dot_do_o_d32_fp16_group_o2_ps.hip -> fmha_ck_autogen_2bb6da1095bd8669c0e48b5cd808cf0dcefa2674.hip 
-fmha_bwd_dot_do_o_d32_fp16_group_o2_psdv.hip -> fmha_ck_autogen_0e0a2370f2a320484d8f9f21e3197425c2dbe9ad.hip -fmha_bwd_dot_do_o_d64_bf16_batch_o2.hip -> fmha_ck_autogen_a9f00f270680de81df7737e848e0408cb070e68b.hip -fmha_bwd_dot_do_o_d64_bf16_batch_o2_pdv.hip -> fmha_ck_autogen_61220f6dca850a5b5ccf1f619a267c40c37efeca.hip -fmha_bwd_dot_do_o_d64_bf16_batch_o2_ps.hip -> fmha_ck_autogen_b192c55f002d8540d5f965cc4df0c2e33f4b9ff9.hip -fmha_bwd_dot_do_o_d64_bf16_batch_o2_psdv.hip -> fmha_ck_autogen_295a523f815eb822d66162d4feb75fe0bc50b648.hip -fmha_bwd_dot_do_o_d64_bf16_group_o2_ps.hip -> fmha_ck_autogen_292b4f995d622826af5d1f2bffa7ba68467c841a.hip -fmha_bwd_dot_do_o_d64_bf16_group_o2_psdv.hip -> fmha_ck_autogen_5e840be0741afa4d41fd4789c8300223fdc63ddc.hip -fmha_bwd_dot_do_o_d64_fp16_batch_o2.hip -> fmha_ck_autogen_0e1dbc9c433ce8ec33ace9e62550261d613db582.hip -fmha_bwd_dot_do_o_d64_fp16_batch_o2_pdv.hip -> fmha_ck_autogen_6eebd0c2fbfc85f938b10535855c388971129a28.hip -fmha_bwd_dot_do_o_d64_fp16_batch_o2_ps.hip -> fmha_ck_autogen_0bc7910aac798f0555e9e505ad7f177c9fbbd92c.hip -fmha_bwd_dot_do_o_d64_fp16_batch_o2_psdv.hip -> fmha_ck_autogen_18b92b4e249195ac3e0c74d246585a4c9e0992fd.hip -fmha_bwd_dot_do_o_d64_fp16_group_o2_ps.hip -> fmha_ck_autogen_278639d44a4a8372a627a7c31e9527c8faa26f97.hip -fmha_bwd_dot_do_o_d64_fp16_group_o2_psdv.hip -> fmha_ck_autogen_8e938d0e3ad30db201880642e57758285b2ec4cb.hip -fmha_fwd_api.hip -> fmha_ck_autogen_1ca3f45d0be2d1119cccd0af042a3e8adeda2ed7.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv.hip -> fmha_ck_autogen_f727911254904ce4341e4ff5f8bafc430b8cfbbf.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi.hip -> fmha_ck_autogen_54208a6e8c5263e38f9ffcb062564ab61d2785ff.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_dropout.hip -> fmha_ck_autogen_1d3ef3d5ded0dfe2a0bafb52ea8f841658db35fd.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_lse.hip -> fmha_ck_autogen_f15c41ddb04ec7f80235bb3db19198dd6b699713.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_lse_dropout.hip -> fmha_ck_autogen_a5c4dc0d70c547dbbfb661e879ba7f9adfafc2ea.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask.hip -> fmha_ck_autogen_d7290cc4c3036c9205e689cbcc60e7d16b97a7d6.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask_dropout.hip -> fmha_ck_autogen_0b2647b5982405a48e8c8888552a4b89386ccdd9.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask_lse.hip -> fmha_ck_autogen_eb278488b2cca114adca5e4614d86f92447f937a.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_29fe68ba10b3480dddc9866c51ca8b5efe962cc3.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_dropout.hip -> fmha_ck_autogen_92992be6252f2afdc368bd4baec4b8a55ae0abf8.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_lse.hip -> fmha_ck_autogen_501dcf3213efd214cc2ce8c9ba0027f991d241b4.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_lse_dropout.hip -> 
fmha_ck_autogen_aa6d13b09f85ee62bb5018608812181fb43afc86.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_mask.hip -> fmha_ck_autogen_d0f63cafbeb445408c884727b473667fb479675e.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_mask_dropout.hip -> fmha_ck_autogen_7596c14b8fee751d03f42ca48ea4f66e87fc2e2f.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_mask_lse.hip -> fmha_ck_autogen_c2b719893a4d8a1e71857966d399f06c0a41749c.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_mask_lse_dropout.hip -> fmha_ck_autogen_071751b1012b90f7b57f8591cd06ae1fd27d9cd3.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv.hip -> fmha_ck_autogen_d00f65bc99ca08eba66564d34f72f2769bff9491.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi.hip -> fmha_ck_autogen_2273457ac3be01cc1595a015a5f598f8290c77e4.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_dropout.hip -> fmha_ck_autogen_63c411351ec59bdbed2590c599f9eddf7807b371.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse.hip -> fmha_ck_autogen_042a156e9eb935555ab14a84461959b466c2fb5b.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse_dropout.hip -> fmha_ck_autogen_eab6cdc59bf216f7045f0cf5f221bb91ec415cd2.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_d703eea8075cacec4d41fee7dc4734f593ee79e8.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_dropout.hip -> fmha_ck_autogen_2f32f2d658f1f69840fbad511ce8a3851c859d52.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse.hip -> fmha_ck_autogen_e6973d75297bd2c3432a7c88e8a9ee1c9ae693bf.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_854c8003a508ed3f8cbe6967c4ae2635a491c721.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_dropout.hip -> fmha_ck_autogen_ceb9544e2a0caae2c9e3dd8bbd2c509e8dca1379.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_lse.hip -> fmha_ck_autogen_e83c604d1b8260958becd1c7c209745ff9151715.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_lse_dropout.hip -> fmha_ck_autogen_3b26eafe76cca8e74e819220b6de1f4279d48e43.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask.hip -> fmha_ck_autogen_d5e82799f4452e148c3e02acd6526cf30757eb52.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_dropout.hip -> fmha_ck_autogen_5435b4651a90e331fcdcf224282457e3dc038a30.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse.hip -> fmha_ck_autogen_1573e3d855d28c54af612ab950b081302891d56d.hip -fmha_fwd_d128_bf16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse_dropout.hip -> 
fmha_ck_autogen_4e47f8fa40332c6ed12d9971e0b539049a871c34.hip -fmha_fwd_d128_bf16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv.hip -> fmha_ck_autogen_b285e2f1970b78e18002464eeda63798229bbc3a.hip -fmha_fwd_d128_bf16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi.hip -> fmha_ck_autogen_75f21e38ad01fade35b1db40adabd75eb602410c.hip -fmha_fwd_d128_bf16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_dropout.hip -> fmha_ck_autogen_81f6c575c3fa2ccc7e65022f1ba65c8cfc16541e.hip -fmha_fwd_d128_bf16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse.hip -> fmha_ck_autogen_45b9871c220c0065d74bffeed4021d0304a9625c.hip -fmha_fwd_d128_bf16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse_dropout.hip -> fmha_ck_autogen_f028af9e5e3c25800dde938e991aaab4fc1d64aa.hip -fmha_fwd_d128_bf16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_7fa76fc1b066a15b08dc6c24a7cf33a58b4cb6cb.hip -fmha_fwd_d128_bf16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_dropout.hip -> fmha_ck_autogen_157768cd725813f8111d265cfdfea7f42034e5e9.hip -fmha_fwd_d128_bf16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse.hip -> fmha_ck_autogen_541874a7633e5713720b9d084b6d1c6715a51a17.hip -fmha_fwd_d128_bf16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_6f88527a2cdb5adf51407f4661a254bb32d7de23.hip -fmha_fwd_d128_bf16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_dropout.hip -> fmha_ck_autogen_a55b47aafc4340e69e300ac61a7601a5c14513b7.hip -fmha_fwd_d128_bf16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_lse.hip -> fmha_ck_autogen_20d5c3c86398f6ce55abc90db3e362dbf9f457f2.hip -fmha_fwd_d128_bf16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_lse_dropout.hip -> fmha_ck_autogen_8cf1007430da272174d3476d042f398627e83512.hip -fmha_fwd_d128_bf16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask.hip -> fmha_ck_autogen_e7d37e7ee96c392fa24c02a9143438a3a7d05741.hip -fmha_fwd_d128_bf16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_dropout.hip -> fmha_ck_autogen_dc91797c1474a368e9cb056b50b4629d7736c3cb.hip -fmha_fwd_d128_bf16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse.hip -> fmha_ck_autogen_3cce3baac1e3ca03af0c3f4ee4d0158ad1031e9f.hip -fmha_fwd_d128_bf16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse_dropout.hip -> fmha_ck_autogen_9d6759d8855c4c6289f1f241a1628cf0406c1b64.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv.hip -> fmha_ck_autogen_b38a1d3cffae01332a3a9d9472ff1b2c443e82af.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi.hip -> fmha_ck_autogen_2cf351fc2c2da4a8e1760a3affc9a5947c6b3bda.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_dropout.hip -> fmha_ck_autogen_bafbef3f13d429ec3e9f4672218998d5669d79f2.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_lse.hip -> 
fmha_ck_autogen_3f34433b784d1e405ade3378918641372a30bf6b.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_lse_dropout.hip -> fmha_ck_autogen_5fb062527121e627871b3f1b2a94b96c42e51205.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask.hip -> fmha_ck_autogen_5732094f5917e9164ee0f973ac6ec47245a69101.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask_dropout.hip -> fmha_ck_autogen_688aaa193f332ed13e017e78ec07a7c80e45f6c5.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask_lse.hip -> fmha_ck_autogen_1cbf88db44aa5f884438288a325270d29c7a04b6.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_2660282ad39ef034fecbdb74acedfb48620b7dfd.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_dropout.hip -> fmha_ck_autogen_a59423c095db052603d77073d409534bceef425f.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_lse.hip -> fmha_ck_autogen_3fcc6893456a559c7d22714116022fc69b372266.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_lse_dropout.hip -> fmha_ck_autogen_c7568e11e44ce70924d27e683190422cfae5c31d.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_mask.hip -> fmha_ck_autogen_f79def2b4edf6d18f6ef1d6b141f9e0435441f6a.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_mask_dropout.hip -> fmha_ck_autogen_32652a27e8605cef59c8341813b68e7513be23c5.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_mask_lse.hip -> fmha_ck_autogen_b20e314642cf565e4f32bceffdb5c0e653ab627b.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_mask_lse_dropout.hip -> fmha_ck_autogen_a74b0e7dd816ad08eec5a1bba6e227afee9813ec.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv.hip -> fmha_ck_autogen_a968df29f5ae1463706b7981b3bde55918e1aa65.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi.hip -> fmha_ck_autogen_b5248f443a12d96815c04409a00102923c717023.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_dropout.hip -> fmha_ck_autogen_291a8bdf9d63b112e7fe5fa7e8835a6789cb8ecf.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse.hip -> fmha_ck_autogen_6d5aad18f59e47a3fa3278c7ef1a6372830c33d5.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse_dropout.hip -> fmha_ck_autogen_c063318cb851ccaa923be12d34c84d839bc64bb8.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_a5a7833f4597bb03a3e845d5580d677e97421040.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_dropout.hip -> fmha_ck_autogen_2d9a04b7f41dd6f0db017157a44790f35c626e2d.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse.hip -> 
fmha_ck_autogen_98f5efcd500ce6b9ffc14bc9877e0ba457539925.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_135ea67de101135ed5fe04f5cab1ec1d7b3714bb.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_dropout.hip -> fmha_ck_autogen_951343832a5bfd060c8d12da0d8a090f070a717d.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_lse.hip -> fmha_ck_autogen_f24d42e820adc1a26a428d59df7ffdd7f8580176.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_lse_dropout.hip -> fmha_ck_autogen_4dbdd9c3f496a27bde68cf86374999ff2dd53505.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask.hip -> fmha_ck_autogen_3be7cea6df8e6dd56194e1172f28943667f1c4ef.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_dropout.hip -> fmha_ck_autogen_483eaea4096c8f5bee16a64860432f0634a253d8.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse.hip -> fmha_ck_autogen_103186dbad604763008e0204a1ea90baecef8877.hip -fmha_fwd_d128_fp16_batch_shb_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse_dropout.hip -> fmha_ck_autogen_367e58867c46d96c9bbaa96eaaa9f93595c9e099.hip -fmha_fwd_d128_fp16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv.hip -> fmha_ck_autogen_311104394c8bef8d4ecff35c1409221e723a5a8a.hip -fmha_fwd_d128_fp16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi.hip -> fmha_ck_autogen_976cf509d9c2bf86ba6ee5ded544fa8e6717f590.hip -fmha_fwd_d128_fp16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_dropout.hip -> fmha_ck_autogen_24410fd9a4150c33186a2a365d06d8f6ea621c20.hip -fmha_fwd_d128_fp16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse.hip -> fmha_ck_autogen_b493c99888d82cd2852bfb101f99a2e6a27665b8.hip -fmha_fwd_d128_fp16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse_dropout.hip -> fmha_ck_autogen_1fda1c96568eab89a8f6498f8bb23c1223cdc7b0.hip -fmha_fwd_d128_fp16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_053981d9e7af2ebc0f91e61ac5e25cbe68c95bd8.hip -fmha_fwd_d128_fp16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_dropout.hip -> fmha_ck_autogen_3110540b50e95e99a5cccebe47d9d3a83093c2fb.hip -fmha_fwd_d128_fp16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse.hip -> fmha_ck_autogen_1fcdcb750f382fc7828a9886585f50efbe5be735.hip -fmha_fwd_d128_fp16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_c3d0eaf9399c863d672e8c08d123739bab837d4b.hip -fmha_fwd_d128_fp16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_dropout.hip -> fmha_ck_autogen_9d69d441f48f9ea346dd8e00376a9a708da3ad87.hip -fmha_fwd_d128_fp16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_lse.hip -> fmha_ck_autogen_3992d5df4ba2e999caf6889a852db4e1ba078e65.hip -fmha_fwd_d128_fp16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_lse_dropout.hip -> 
fmha_ck_autogen_f30316cfe49323638f71ba688dd8ff9b2266b335.hip -fmha_fwd_d128_fp16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask.hip -> fmha_ck_autogen_797750ac0b18b48f56ceb4640256e9bd3a36621a.hip -fmha_fwd_d128_fp16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_dropout.hip -> fmha_ck_autogen_942439e4f5644a3a4630481bc7d98834b29b6e1c.hip -fmha_fwd_d128_fp16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse.hip -> fmha_ck_autogen_fac99c3c82b77946f6844699d2333cd532a78a26.hip -fmha_fwd_d128_fp16_group_hbs_b128x128x32x128x32x128_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse_dropout.hip -> fmha_ck_autogen_98f9a4f4d85f292b78123599a2e1798f12aa545b.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr.hip -> fmha_ck_autogen_ea591185b1c5f521023e250a26f742984255b241.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_alibi.hip -> fmha_ck_autogen_48300e0aeabe337785d4c7b41796ce65df6cc42a.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_alibi_dropout.hip -> fmha_ck_autogen_e514c6b4bc75d95a150104a17972abae77cb47ed.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_alibi_lse.hip -> fmha_ck_autogen_a64b4cf3f6706e4b4e0af4402e2263b9a1585f9b.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_alibi_lse_dropout.hip -> fmha_ck_autogen_e389d0e4442cd8304081892ddc75043e68a6398c.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_alibi_mask.hip -> fmha_ck_autogen_ab43f4a56c166dad0113f51b337a083f4df7cdb6.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_alibi_mask_dropout.hip -> fmha_ck_autogen_d4645b713821371161a9925dec8a3d6c157ba1aa.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_alibi_mask_lse.hip -> fmha_ck_autogen_0b90a0186d8b8004e3f19886c7992c8e04d0e066.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_d34d6cdcd81a456125ab5e0875466c6334d8e5c8.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_dropout.hip -> fmha_ck_autogen_d0b09e8513646fbb2a007544a63ec9e2b04dc4c2.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_lse.hip -> fmha_ck_autogen_ca3d98ff43fbb80ceb82fc22ab039bee898969b0.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_lse_dropout.hip -> fmha_ck_autogen_7ea9c37d92e344f3cc58cd4d1d00f19167e3623e.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_mask.hip -> fmha_ck_autogen_db85839ee8d464c5a81b8dad9839f5e0f4b467a8.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_mask_dropout.hip -> fmha_ck_autogen_32527660fa7aeb9a951a9f2fc3c53989bd141c48.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_mask_lse.hip -> fmha_ck_autogen_528db08068589c6e4c096054d26a2e5be63285b6.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_mask_lse_dropout.hip -> fmha_ck_autogen_d600779c17b7b21c18e1308e6d765fe02a7945d3.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv.hip -> fmha_ck_autogen_445e28a8a51cd435130ded2abc9fc606e522c713.hip 
-fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi.hip -> fmha_ck_autogen_8a980749c6b2a18c80426dd189e5506334343ca4.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_dropout.hip -> fmha_ck_autogen_da822ea727fb3543e445e4000f7e6ebb946d6a3b.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_lse.hip -> fmha_ck_autogen_f525b59df454ccf53da6cb201e0aa8d09f52a2ad.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_lse_dropout.hip -> fmha_ck_autogen_0a2b116fd5065109aae46ee547e4f49ad0e9d6e1.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_366662dccf2f650bcd8123c49006c759cd4c0ef6.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_mask_dropout.hip -> fmha_ck_autogen_816c48e129a0235cb3a19124ddb28cce286fb368.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_mask_lse.hip -> fmha_ck_autogen_356f83cb96d0313abcdb24955edd4264df72aed7.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_0e661b5f30566d1f159f060c264849c7ae4772f1.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_dropout.hip -> fmha_ck_autogen_61a9e92183ba87924e73ff0b5e25bd12d6038e69.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_lse.hip -> fmha_ck_autogen_e502730dea6987e2c038446c448aa08bdcc23113.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_lse_dropout.hip -> fmha_ck_autogen_f851da732f397624717160f89271514bc334b59b.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_mask.hip -> fmha_ck_autogen_fd345632e0cae0d549ba79626a08b1885711deb6.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_mask_dropout.hip -> fmha_ck_autogen_937c48d0b7096ad6c8bc445f13f2c8c1934695ab.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_mask_lse.hip -> fmha_ck_autogen_a2482a64659c838f3da55f56e3cbbee1dbfe6722.hip -fmha_fwd_d256_bf16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_mask_lse_dropout.hip -> fmha_ck_autogen_f34fdb8294257d951dcc9c4fa7ecf1192568b91b.hip -fmha_fwd_d256_bf16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv.hip -> fmha_ck_autogen_0aafb881e34a3794970a1282af740b3f19c138b1.hip -fmha_fwd_d256_bf16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi.hip -> fmha_ck_autogen_c250ea59ab6e1ee39cce15cbd3f181047cdee31a.hip -fmha_fwd_d256_bf16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_dropout.hip -> fmha_ck_autogen_4ce671f5defd76ca08614a7a1f184c36c0f1e2ab.hip -fmha_fwd_d256_bf16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_lse.hip -> fmha_ck_autogen_b9627f9c8d0088df0364a64643f2b5dcd951f2bb.hip -fmha_fwd_d256_bf16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_lse_dropout.hip -> fmha_ck_autogen_a6461d72fb6ba50e81de3f661528c96dcfdc3f3c.hip -fmha_fwd_d256_bf16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_mask.hip -> 
fmha_ck_autogen_aa82d20635e592edbf00439294835f6f39ad54a3.hip -fmha_fwd_d256_bf16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_mask_dropout.hip -> fmha_ck_autogen_146eb8c40e3146e06936f3141b2c4d92a578ddec.hip -fmha_fwd_d256_bf16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_mask_lse.hip -> fmha_ck_autogen_c28de8f96c8315877031a2d56261e95fee6aef44.hip -fmha_fwd_d256_bf16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_39422621a00ff79b2f5ec0dafb957c77693537b3.hip -fmha_fwd_d256_bf16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_dropout.hip -> fmha_ck_autogen_a0a556c9358ddd6db719458c81d2d6d822a895da.hip -fmha_fwd_d256_bf16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_lse.hip -> fmha_ck_autogen_c2fcced07cc194a8050bc7b2f791453b3f5b2064.hip -fmha_fwd_d256_bf16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_lse_dropout.hip -> fmha_ck_autogen_210ef512b7862837f54acbc3b21e135a192647a3.hip -fmha_fwd_d256_bf16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_mask.hip -> fmha_ck_autogen_bef3bd014a918feddadc98eed92a7734f9bcd890.hip -fmha_fwd_d256_bf16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_mask_dropout.hip -> fmha_ck_autogen_ae1ab1f4bbe86bb9bbc22e4774648076c321136f.hip -fmha_fwd_d256_bf16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_mask_lse.hip -> fmha_ck_autogen_52a8a323414448c50571a334f29bc0a38919b61d.hip -fmha_fwd_d256_bf16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_mask_lse_dropout.hip -> fmha_ck_autogen_204a573ce6b7d2f90aede543939315561cc43177.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr.hip -> fmha_ck_autogen_d8901a63986cc28ef24cab012b32114851a8c1ec.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_alibi.hip -> fmha_ck_autogen_12d5c8a4988efe60ef7943ecd73e18a28a736583.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_alibi_dropout.hip -> fmha_ck_autogen_e5b65fc519ea7cfcd19f7eddbc3acad6842ff558.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_alibi_lse.hip -> fmha_ck_autogen_743176ecb1f0bc800c870861585edf56f88d7739.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_alibi_lse_dropout.hip -> fmha_ck_autogen_6b0ef67ce0f178aa2863c4909f5bdd7f766c9b2f.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_alibi_mask.hip -> fmha_ck_autogen_ef40f0acf1885096efb840ec5600ec421c4db331.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_alibi_mask_dropout.hip -> fmha_ck_autogen_523e5bf45ec5008aa3aba4773e68a78e122b2fe7.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_alibi_mask_lse.hip -> fmha_ck_autogen_55cda610c235987e13232e828f8d86fa88030560.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_566b4782793c6526bfce7362efbf6bf069928b2b.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_dropout.hip -> fmha_ck_autogen_cfec97bdfb6fa95e057eaf5a8138853e1c0884f2.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_lse.hip -> 
fmha_ck_autogen_6905ba47078abd7a5b6a51eb93b26095517e7f70.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_lse_dropout.hip -> fmha_ck_autogen_8840e8899b4e632714632450bcef001c6070f955.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_mask.hip -> fmha_ck_autogen_d867098db97b3f26e71a151c63b74260bfab21f8.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_mask_dropout.hip -> fmha_ck_autogen_bc238fd2095b26a167b41cdec8280182330b7b25.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_mask_lse.hip -> fmha_ck_autogen_b737410b404a51043fc3bd503c0b107c297e4c9f.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_mask_lse_dropout.hip -> fmha_ck_autogen_b4a5715b550f67b8870ba66e1e6282a26cc1dbf3.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv.hip -> fmha_ck_autogen_12207f4b6e7fac27d6c16493a5373f448a2aaae8.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi.hip -> fmha_ck_autogen_7d5667b27f15a06d4040354fba3601d48bb9c045.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_dropout.hip -> fmha_ck_autogen_2695783ae8f0034692efd6563f789ef03fd0f4f3.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_lse.hip -> fmha_ck_autogen_60801d21c14796c08377349ec86a6c800af497b7.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_lse_dropout.hip -> fmha_ck_autogen_159ee1f1b44d1a8fbaead65d8449413bb616d15e.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_9f0517550c7a23882b95de451e8099ea2186b4ce.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_mask_dropout.hip -> fmha_ck_autogen_80f51f0e178c33e6196df1d2e47bd38bf5391cc8.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_mask_lse.hip -> fmha_ck_autogen_489e7be0f85656d012a6451b65f6c1d2613b187d.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_e7de729aa50c10d8101ef504138c3769e3286753.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_dropout.hip -> fmha_ck_autogen_25b3225da1e1842f83592971a1f62a0fe30aa9d3.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_lse.hip -> fmha_ck_autogen_ce4714e4f33340859c106a3129993e22652262e2.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_lse_dropout.hip -> fmha_ck_autogen_bc4e0f0496a34d2fb43c80ce0162ad4183f29064.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_mask.hip -> fmha_ck_autogen_a9d2be18e2d53a5144f97dfdebb225fcb6d611d3.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_mask_dropout.hip -> fmha_ck_autogen_4ab5d6e8fbfd92e9f7e47bda5cfbb0d4162a6319.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_mask_lse.hip -> fmha_ck_autogen_ac1ccde31b47e0e56ee0daab6403fed7895208c7.hip -fmha_fwd_d256_fp16_batch_shb_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_mask_lse_dropout.hip -> 
fmha_ck_autogen_5cd03e29403ad53d6d52e5e81182ea6ff5aff2be.hip -fmha_fwd_d256_fp16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv.hip -> fmha_ck_autogen_2005aca3520b171bb82d10ad70fef44f28c19776.hip -fmha_fwd_d256_fp16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi.hip -> fmha_ck_autogen_c402e84359b2037a29efd1d6ce7213ba7605ab25.hip -fmha_fwd_d256_fp16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_dropout.hip -> fmha_ck_autogen_95061acc6650fc7b79fa1fe5b2b1e083555eec2c.hip -fmha_fwd_d256_fp16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_lse.hip -> fmha_ck_autogen_1fd9fa7c2e13d0bad5fddb2b5a316bbc09d397ea.hip -fmha_fwd_d256_fp16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_lse_dropout.hip -> fmha_ck_autogen_dd9494d9ac35eba6794a4f9120d2db9932596ef8.hip -fmha_fwd_d256_fp16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_82d7f61e6313930f063758b61102e7a43b118beb.hip -fmha_fwd_d256_fp16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_mask_dropout.hip -> fmha_ck_autogen_2b50073f6dfeb7ea77d5dce288a1d2f08f8f6362.hip -fmha_fwd_d256_fp16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_mask_lse.hip -> fmha_ck_autogen_5fa7fafd4227918e0c7f0c6ca3b2bd673cd07279.hip -fmha_fwd_d256_fp16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_522a2a9435103ed405dc1500d31652f1d431a49d.hip -fmha_fwd_d256_fp16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_dropout.hip -> fmha_ck_autogen_4b7393d55600c9892558248f4131fc06a6cf3309.hip -fmha_fwd_d256_fp16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_lse.hip -> fmha_ck_autogen_d66c30148a6fa816937f2f095802264d3dfa0273.hip -fmha_fwd_d256_fp16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_lse_dropout.hip -> fmha_ck_autogen_8f7166d4bb0c1c9b9999ba16a1adbf09ebfdb6f1.hip -fmha_fwd_d256_fp16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_mask.hip -> fmha_ck_autogen_80cf0997573f4bcfbaaf75e40f519580a7495a17.hip -fmha_fwd_d256_fp16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_mask_dropout.hip -> fmha_ck_autogen_48d7d145f96aa8958a9208d0c8887742a8c834fd.hip -fmha_fwd_d256_fp16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_mask_lse.hip -> fmha_ck_autogen_bb111b7acc269f8d5e70915d3efde4c425aa5f5c.hip -fmha_fwd_d256_fp16_group_hbs_b128x128x32x256x32x256_r4x1x1_r4x1x1_w32x32x16_qr_vr_psskddv_mask_lse_dropout.hip -> fmha_ck_autogen_48435e5dd23e49e19dd313f9891ffec800ce74c2.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv.hip -> fmha_ck_autogen_e2762543d3380185e304f84749a70db1b8d3dd8c.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_alibi.hip -> fmha_ck_autogen_5093976cb7b32a8bd28ce92fc13af00a3e21f737.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_alibi_dropout.hip -> fmha_ck_autogen_efc6a7b25710f0626c3af534111b161e1459d2e1.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_alibi_lse.hip -> fmha_ck_autogen_a8a744edfa3a19d1493611df5bd0d4d59b707d43.hip 
-fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_alibi_lse_dropout.hip -> fmha_ck_autogen_e95e3908479965856843317c8b0c42a6961dfd23.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask.hip -> fmha_ck_autogen_2b5317b6cde327a842170ebff20c2b03d81379ff.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask_dropout.hip -> fmha_ck_autogen_99ae680eed89ea93a3a94586bd5a68dbc5439f37.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask_lse.hip -> fmha_ck_autogen_1edaf9d4270d2ac61c299320e06ba73f44730364.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_0a4e76d89b175e1d9fd2e9fb908d5fce1ebb945d.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_dropout.hip -> fmha_ck_autogen_fba47fa8d9b5375bc408af68b67345ab9dba2eb8.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_lse.hip -> fmha_ck_autogen_830e3532f27b391585d5de90f3bdf97992b67651.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_lse_dropout.hip -> fmha_ck_autogen_66a020f728df204ff51e37d2ddc21afb0aad5e7b.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_mask.hip -> fmha_ck_autogen_07c3fc96d2bebe546dce6ebf46e5c7a519959599.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_mask_dropout.hip -> fmha_ck_autogen_74d5f2aef029f2103bb419cc982cae99fd1a9253.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_mask_lse.hip -> fmha_ck_autogen_58a784fb478ff5b3f1e2da9765a3a777efda92e3.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_mask_lse_dropout.hip -> fmha_ck_autogen_0766e7aa4b263a811408b285213e47176ee2bdaf.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv.hip -> fmha_ck_autogen_bbe23201fbebed25781f249e5c77c31e0e7f9ddb.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi.hip -> fmha_ck_autogen_7a890b126da2d8cfbf84f048b779cac2dd56b509.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_dropout.hip -> fmha_ck_autogen_58679919fcd292a2a69543de0db94e2985c9d364.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse.hip -> fmha_ck_autogen_84fc5e94f89d6a9287cf64662a372784511468dd.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse_dropout.hip -> fmha_ck_autogen_3bed3aaf24c73073c604a3b23bb4b0358b8e3490.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_dc5ba6d73f331c76e696953606c5b347b6a46f3f.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_dropout.hip -> fmha_ck_autogen_b4f12f10d7b968e0d8e7c23f36d3a360de74a905.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse.hip -> fmha_ck_autogen_41b68458076e6cb129d3ec793e95b91430a0c8a1.hip 
-fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_56ffe9e21362afe9c3a407c09d5de186954931a6.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_dropout.hip -> fmha_ck_autogen_2ba934408c75da5479cc41f96b98ea7d333635ea.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_lse.hip -> fmha_ck_autogen_bd6aa39d0ae3c87d011610cdb5e2e317f337c454.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_lse_dropout.hip -> fmha_ck_autogen_aece14f7a220222eb4ce6783ec2b9fce6fde94b8.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_mask.hip -> fmha_ck_autogen_6e240106c771ebea461fc2a87b6da68e510aba70.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_mask_dropout.hip -> fmha_ck_autogen_232f61bf31dbb5de5d7039d5ff2338068a759b68.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse.hip -> fmha_ck_autogen_e0e48d7edfe9513f24ad9fae68cac3aa940b17dd.hip -fmha_fwd_d32_bf16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse_dropout.hip -> fmha_ck_autogen_bc897852a4ca992961843144f4ec4f8b86dd5e9d.hip -fmha_fwd_d32_bf16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv.hip -> fmha_ck_autogen_f1246d1013d954a9316f4432c986d3be9459c548.hip -fmha_fwd_d32_bf16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi.hip -> fmha_ck_autogen_6a4b6226b355bf35d4d07aaef1828091f03ad2ec.hip -fmha_fwd_d32_bf16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_dropout.hip -> fmha_ck_autogen_2b49a9b0801a06dd89c7f7182d7590b515df1592.hip -fmha_fwd_d32_bf16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse.hip -> fmha_ck_autogen_50e7b11019fc2299d70869253877319b03388244.hip -fmha_fwd_d32_bf16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse_dropout.hip -> fmha_ck_autogen_7f9bb3486fee7b7c9e24300b8a4e4ce88a11bfc0.hip -fmha_fwd_d32_bf16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_6785dcec0197fdbb50124ab06efa627f1a2c0567.hip -fmha_fwd_d32_bf16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_dropout.hip -> fmha_ck_autogen_f87991cb7787a29d3ce4711b4ce04c5fb6a14ca9.hip -fmha_fwd_d32_bf16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse.hip -> fmha_ck_autogen_bc4425e30a0b17e8b31726817e8d3177b5c51934.hip -fmha_fwd_d32_bf16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_54940ce53998becf9bddf56df7d19894a7658168.hip -fmha_fwd_d32_bf16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_dropout.hip -> fmha_ck_autogen_ebb241b947a0adfc8e50c5d71765c14af24593ae.hip -fmha_fwd_d32_bf16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_lse.hip -> fmha_ck_autogen_3d3f3eb2f5eb1f3287879604892b1c230df85f1d.hip -fmha_fwd_d32_bf16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_lse_dropout.hip -> fmha_ck_autogen_7b9a3bf1a9b37e0bd9bae6249609e5994dc0dba1.hip -fmha_fwd_d32_bf16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_mask.hip 
-> fmha_ck_autogen_14221590b90c48d3cf259fb4e834ccfaf7f3209b.hip -fmha_fwd_d32_bf16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_mask_dropout.hip -> fmha_ck_autogen_445cd8fa559588f4264ce6192f2de3e3065365ea.hip -fmha_fwd_d32_bf16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse.hip -> fmha_ck_autogen_7a902ed4ae3cc6558c73b730ff3949778007a230.hip -fmha_fwd_d32_bf16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse_dropout.hip -> fmha_ck_autogen_0682150e93f547e00f13cd8984779bf49b91e50c.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv.hip -> fmha_ck_autogen_d86e4dcbe9c4cac8f7c8c5d97ce384ae0cbdbfbc.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_alibi.hip -> fmha_ck_autogen_1df893ee660d37fba7eaca452ae65b3e45a73087.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_alibi_dropout.hip -> fmha_ck_autogen_92739f4464512feee083b875e11e11eee4f5b448.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_alibi_lse.hip -> fmha_ck_autogen_65910c8b7a30acc731948ab58467fdbe4fe32f6d.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_alibi_lse_dropout.hip -> fmha_ck_autogen_df5b1c6758d4b8540158299dd0362297083084c2.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask.hip -> fmha_ck_autogen_ec7fc24902b1ebd8f2bf8088b0ecf6de8be8362d.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask_dropout.hip -> fmha_ck_autogen_9e51083e13aa4dfa8c969f8f916835a8e5e9ca39.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask_lse.hip -> fmha_ck_autogen_b41ea5293bc1c56efa2c4b5681d965aa6f2ce6c3.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_813e60e8405aca3f7fbed19452ae37574ada9a77.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_dropout.hip -> fmha_ck_autogen_0ebacd06455ab20eba78b389462946716b5819f6.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_lse.hip -> fmha_ck_autogen_15b255dde1a9d915e582ee2a83de7d83190c6a24.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_lse_dropout.hip -> fmha_ck_autogen_7b2d3680c3578c7292349b58843aef7a82e0087d.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_mask.hip -> fmha_ck_autogen_1d21263e16dafe79b9fe2f998847296e575c14e7.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_mask_dropout.hip -> fmha_ck_autogen_2d23a26e0a59a8323dd97632e610d24624143fbe.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_mask_lse.hip -> fmha_ck_autogen_4fa883a36a76edb276a66c5d779294f170d6d4b7.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psddv_mask_lse_dropout.hip -> fmha_ck_autogen_9207a63fc55c411c73e4f93306c5ffed800dd249.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv.hip -> fmha_ck_autogen_0a68c2f9a3acdd787b81be455cbc7836c8bfd90c.hip 
-fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi.hip -> fmha_ck_autogen_4217a48a1677bd26cd48e512f1fc8830a8a551b8.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_dropout.hip -> fmha_ck_autogen_2b0bcb241e5a1be1d35366461408d06e095a26ef.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse.hip -> fmha_ck_autogen_f3193ea266f3718398bc5622f8bc7042c3527a42.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse_dropout.hip -> fmha_ck_autogen_bb28a4e95723e3df380f98b5ac107c4df353850b.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_61204f6805d5d830aa6fca2a9b5f238ed63c3a73.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_dropout.hip -> fmha_ck_autogen_6649f19deeaea20663bee781af7edced7f7a4fc0.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse.hip -> fmha_ck_autogen_d3784fb4c0685d7b651f4113f3c71e050881f3a5.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_ed6bdf67720e938d538a867548ac3579b8238169.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_dropout.hip -> fmha_ck_autogen_971a08c2e48d805b295d979b24173a04cf58def0.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_lse.hip -> fmha_ck_autogen_c4997f79435cf64add10506acb97d0647cfbb3d4.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_lse_dropout.hip -> fmha_ck_autogen_188a70d526394e254274df95de0727850820326c.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_mask.hip -> fmha_ck_autogen_661b49505cfecbe4ec3e5c7371de3aaaa85ac9d5.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_mask_dropout.hip -> fmha_ck_autogen_d63c8c746055851217a514321cd735eaf6937263.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse.hip -> fmha_ck_autogen_745705ae121a1a331527cedfe4d31218a428a0df.hip -fmha_fwd_d32_fp16_batch_shb_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse_dropout.hip -> fmha_ck_autogen_6fa6478cc27e52fd9511fbff38369c921155cfb9.hip -fmha_fwd_d32_fp16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv.hip -> fmha_ck_autogen_4fa4d21931b9afcbd70b1567995d3eeb6f9308aa.hip -fmha_fwd_d32_fp16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi.hip -> fmha_ck_autogen_d43715cce8935439f90172d141050d78c7e76fb7.hip -fmha_fwd_d32_fp16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_dropout.hip -> fmha_ck_autogen_ae1afeb6cfdf860ff08e4c2f11c922fd5bfa621a.hip -fmha_fwd_d32_fp16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse.hip -> fmha_ck_autogen_f24bd5b92ce6bba640b8ec6b4e53fe35902c5572.hip -fmha_fwd_d32_fp16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse_dropout.hip -> fmha_ck_autogen_481415463f0316ebe25ff2fda47c68cc54db3359.hip 
-fmha_fwd_d32_fp16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_db5016bff9e5dc37184d2b9417eb351c7ea1c322.hip -fmha_fwd_d32_fp16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_dropout.hip -> fmha_ck_autogen_d64b8b52f4a98801e185e2f132b2f80c29dd0c37.hip -fmha_fwd_d32_fp16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse.hip -> fmha_ck_autogen_09ecb6347009f6a5d5530a6acf90f9f40288cbcf.hip -fmha_fwd_d32_fp16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_50e59bd079f4d205b613056f975fd2b4e372ab10.hip -fmha_fwd_d32_fp16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_dropout.hip -> fmha_ck_autogen_fd10a3b937e9659716925e39a01d794914b08e26.hip -fmha_fwd_d32_fp16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_lse.hip -> fmha_ck_autogen_ec51d24ab5f24e003ed6751ae8ae5b327892b15a.hip -fmha_fwd_d32_fp16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_lse_dropout.hip -> fmha_ck_autogen_a5f8b7b2a891aa9f2ab49762eb31d835efdf18b6.hip -fmha_fwd_d32_fp16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_mask.hip -> fmha_ck_autogen_9a0a70932bd587759df1e5e150b25b0126d7b529.hip -fmha_fwd_d32_fp16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_mask_dropout.hip -> fmha_ck_autogen_9d3d274058bc0a3d4d35d90669587761fdfbdba1.hip -fmha_fwd_d32_fp16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse.hip -> fmha_ck_autogen_23914c00690ac5c4f89cdbbaf00732ba66c5c0ef.hip -fmha_fwd_d32_fp16_group_hbs_b128x64x16x32x32x32_r2x1x1_r2x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse_dropout.hip -> fmha_ck_autogen_0befed50a89d80c22b2c8c3d5ba67d73c3d0190e.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv.hip -> fmha_ck_autogen_88c04463f9c5ce565a9daa8c22e16de80fadd707.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi.hip -> fmha_ck_autogen_01e8aedb7b7d77f44a46b2e9b7a826f245aaf4a7.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_dropout.hip -> fmha_ck_autogen_beae876d6da465687f162136231f15767cc7bb14.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_lse.hip -> fmha_ck_autogen_26f90358e522d7bb7c76c3a2c6010f0f38788bb6.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_lse_dropout.hip -> fmha_ck_autogen_d7bda8157fb27d544e049fd7d2ec735725f1bf44.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask.hip -> fmha_ck_autogen_9fb389d4b5ba590baa951f17da06f0e53d2bfa55.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask_dropout.hip -> fmha_ck_autogen_428ce4e14cf94b284ffa735fe03d923cc74c9fe0.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask_lse.hip -> fmha_ck_autogen_900d7f81c73b35ea64095d01c5d48d9190839e0a.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_d2daccc4b3a0f90bff39cb4597f8b7e484613d9e.hip 
-fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_dropout.hip -> fmha_ck_autogen_f280e1639680ac1e5830a21f921bfe2cf364ef42.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_lse.hip -> fmha_ck_autogen_0dde401aa76cb5425563cbbdb0362748148da3ca.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_lse_dropout.hip -> fmha_ck_autogen_dc62a8db637d32e7dfdb2521cbdae6e1fbbd5fd1.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_mask.hip -> fmha_ck_autogen_4cd3de43cc1f7588d62a10362f59d113ee818846.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_mask_dropout.hip -> fmha_ck_autogen_224f9af5e5ca519b21b71a54acb49f50b4999c47.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_mask_lse.hip -> fmha_ck_autogen_4c8720923c3452e3aebd7b9c1b4b23f0c35d7e4f.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_mask_lse_dropout.hip -> fmha_ck_autogen_2c7aede7762a524a7a424cc4dc46e43fdedf73a2.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv.hip -> fmha_ck_autogen_a98925d99dc484da41dd55700e151cf545cf821d.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi.hip -> fmha_ck_autogen_1c65ba6dba01da9caa84ba89453b61d81376763f.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_dropout.hip -> fmha_ck_autogen_4b76e5dce9af523422782dd25d8dcf6f25edc68f.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse.hip -> fmha_ck_autogen_fe245e9ea974adce2b9807d33b9ba12d916eaffb.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse_dropout.hip -> fmha_ck_autogen_281d897ad17d7f6db2741b396e6b85a9b8f35286.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_31a968898f0bc6366313e41eddb5e3a3ed12dc98.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_dropout.hip -> fmha_ck_autogen_52688999141a72e61322140db29043ef9f7fbc3d.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse.hip -> fmha_ck_autogen_92b722cdabcfaa388ccc6ccceb7e42462f3bdcd1.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_47f3ced9b5ddb0dfee8ed5e7df8eca0bbe273047.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_dropout.hip -> fmha_ck_autogen_d2dfdb42c1b380e860aa5609302f29698dd27923.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_lse.hip -> fmha_ck_autogen_7fe409f4421193fb48a54aa5f26bd6229d23204c.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_lse_dropout.hip -> fmha_ck_autogen_b3a104733f678193068d8642d6560faa03897258.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask.hip -> fmha_ck_autogen_df66feebc9a0dcc508ce002c255154622875e524.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_dropout.hip -> 
fmha_ck_autogen_8fa4c40e244b412a07933d369704bcdaa6d5e74c.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse.hip -> fmha_ck_autogen_60efa9c427dc278c0d1bc31189f683cd45e4d873.hip -fmha_fwd_d64_bf16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse_dropout.hip -> fmha_ck_autogen_8e50ea8dd480012cbe10be392cd26d1870e6ef9b.hip -fmha_fwd_d64_bf16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv.hip -> fmha_ck_autogen_e5ccd5f7ddc894b2717112cbfc766804e02b7bd1.hip -fmha_fwd_d64_bf16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi.hip -> fmha_ck_autogen_4911bdd71351610d55916d452495e599960d0a41.hip -fmha_fwd_d64_bf16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_dropout.hip -> fmha_ck_autogen_d2d08c5470a385d0160b2c1441fd1c30fff1c17c.hip -fmha_fwd_d64_bf16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse.hip -> fmha_ck_autogen_012c0f480917c329f4c3c6c666cf32af2d82b294.hip -fmha_fwd_d64_bf16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse_dropout.hip -> fmha_ck_autogen_0bb81407c8a2b3cdc5fecf655b3ad64d5d729cc9.hip -fmha_fwd_d64_bf16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_7ff65c7abd9b0d8a2df9302d6dc167637b3a72f0.hip -fmha_fwd_d64_bf16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_dropout.hip -> fmha_ck_autogen_d712f23ef88ae5d7b161d36f42d22a5ba53b6354.hip -fmha_fwd_d64_bf16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse.hip -> fmha_ck_autogen_5bc803342862aa30e23e5be7d84e611bc571c529.hip -fmha_fwd_d64_bf16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_0ace6e29e1d3060c3086c08fe27b471e375f9c75.hip -fmha_fwd_d64_bf16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_dropout.hip -> fmha_ck_autogen_54ff49018f1c12b9fa31e523ad40b9cc162ba34d.hip -fmha_fwd_d64_bf16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_lse.hip -> fmha_ck_autogen_351425a006aeeff4d69c8570cb6bf1e1427d2c21.hip -fmha_fwd_d64_bf16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_lse_dropout.hip -> fmha_ck_autogen_fcb6ef39c3db49f26f736d6c9221dd825409ec4e.hip -fmha_fwd_d64_bf16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask.hip -> fmha_ck_autogen_f98a6b193fec3203eaa75819f6b51aa45a48f212.hip -fmha_fwd_d64_bf16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_dropout.hip -> fmha_ck_autogen_2d446754d7000673779d15d3e73039fd3c10a720.hip -fmha_fwd_d64_bf16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse.hip -> fmha_ck_autogen_ca00cfdc5592b7440d72482a18781e9cf3afb05a.hip -fmha_fwd_d64_bf16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse_dropout.hip -> fmha_ck_autogen_1211733062ed30b876f1d63bffa642d77e258dd6.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv.hip -> fmha_ck_autogen_9b6d08e63b9a90f2524cbfa8c5fcf8b82a1d2d36.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi.hip -> fmha_ck_autogen_e52e3053f30f780f346fa6b7a836ad2554cb85df.hip 
-fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_dropout.hip -> fmha_ck_autogen_3ecf565a5a1c4a09887c67ac3b9a019dca427ac0.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_lse.hip -> fmha_ck_autogen_52a89981a05963efcea7ba5c1e967638beeebbbb.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_lse_dropout.hip -> fmha_ck_autogen_2173b7c710d418f44dc2b41bec5905024334eae5.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask.hip -> fmha_ck_autogen_b1ad101ce91348266d3885afdf2996a0fdb72135.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask_dropout.hip -> fmha_ck_autogen_4da9e9b7277bc90518ab92860bef2097ba96d982.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask_lse.hip -> fmha_ck_autogen_7e1bdde812c332c9fc58613698568a04771b9fa8.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_1acf2f892742b1d236d2b31a8185c6869126adad.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_dropout.hip -> fmha_ck_autogen_155bafb551768855c8c01faa63e44764ebe6c110.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_lse.hip -> fmha_ck_autogen_f053c9c32518b895daaa3521827f37af78836fb8.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_lse_dropout.hip -> fmha_ck_autogen_adf160741a4f751d2f15d6eb23d4121cdca62b55.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_mask.hip -> fmha_ck_autogen_34c2db98d8e2e690f499f41cfd5afb831b756f54.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_mask_dropout.hip -> fmha_ck_autogen_0789852b0cd3cc030c78b28f2fd5b6b0546382a4.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_mask_lse.hip -> fmha_ck_autogen_532a6ffd8a21d3e98342fd401f0247f62ca4e038.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psddv_mask_lse_dropout.hip -> fmha_ck_autogen_d0daa59f5dce6fc3965193ae37d8c82a3d1834e6.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv.hip -> fmha_ck_autogen_4a4a00bd6ea27ff20a2903d619e1361b5e27672a.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi.hip -> fmha_ck_autogen_93054acb8a9508fd0f0f486367fb62454de47c39.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_dropout.hip -> fmha_ck_autogen_b774450ebadaacf23e944aaf8ca90eada01e8a5a.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse.hip -> fmha_ck_autogen_2a833fc01e88bd8e256ef64ae8251dd0ed10720b.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse_dropout.hip -> fmha_ck_autogen_aa522b43c5e5ea69bcabb4c0fe28def2bd081a12.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_67fb736c61088b8dd92fe0371f5c98e23bf9077f.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_dropout.hip -> 
fmha_ck_autogen_b5c3131fb8e5a25bd4a14bc9075eb6fa01b61d02.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse.hip -> fmha_ck_autogen_d7fae2c18645d36a181a0bdd2d8ca7a4ac0f6d1d.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_c355189ade9b1a8269230232db754a3881b53168.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_dropout.hip -> fmha_ck_autogen_e035773419a9b3631698a3d375d829af55f7731e.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_lse.hip -> fmha_ck_autogen_d992eab7de49033f5480c5e86a69e675db0d2a19.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_lse_dropout.hip -> fmha_ck_autogen_5382a30dcf702daae19bd6705864bfe36e09502c.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask.hip -> fmha_ck_autogen_167f5328b035ed59a6f05dfee31edd704c4b07ee.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_dropout.hip -> fmha_ck_autogen_c1b94e19d762ddc33cc4e94c6675d93cbde21e3d.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse.hip -> fmha_ck_autogen_606f5e0b99814b0a82a731de36f28024bc317801.hip -fmha_fwd_d64_fp16_batch_shb_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse_dropout.hip -> fmha_ck_autogen_0ad9d68fcee021437e13ffdf94d78252205f5a31.hip -fmha_fwd_d64_fp16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv.hip -> fmha_ck_autogen_85156f2c556c6ef6180608c361b7b35ede71ffea.hip -fmha_fwd_d64_fp16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi.hip -> fmha_ck_autogen_890aa875ac13957f00b30210477924697abf0c9e.hip -fmha_fwd_d64_fp16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_dropout.hip -> fmha_ck_autogen_3108502fd29d3a24b32177bcea968121ee809115.hip -fmha_fwd_d64_fp16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse.hip -> fmha_ck_autogen_d66b79c4ebdcfd239cecec58203606bc123bd6bb.hip -fmha_fwd_d64_fp16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_lse_dropout.hip -> fmha_ck_autogen_5efe77ca5c394a60af0313072cdd132216a52bf3.hip -fmha_fwd_d64_fp16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask.hip -> fmha_ck_autogen_772016803aa3ca6ebe785557118365f9be7c4339.hip -fmha_fwd_d64_fp16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_dropout.hip -> fmha_ck_autogen_93728d999ae43ee1b5a16e60b90cf8533c7d303f.hip -fmha_fwd_d64_fp16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse.hip -> fmha_ck_autogen_a1cba1509c413c870c5d784410855ee1bd737da2.hip -fmha_fwd_d64_fp16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_alibi_mask_lse_dropout.hip -> fmha_ck_autogen_c59ab718fa23f24f09a713ac28a339208a7a5802.hip -fmha_fwd_d64_fp16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_dropout.hip -> fmha_ck_autogen_afcafd07c1f56e74373ccf37db35976023456d50.hip -fmha_fwd_d64_fp16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_lse.hip -> fmha_ck_autogen_ebb9abf5b09e63cbe76390bb46ff7cbefb3141f0.hip 
-fmha_fwd_d64_fp16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_lse_dropout.hip -> fmha_ck_autogen_419461cdb5687ebbb7bf0be136071d70420c1619.hip -fmha_fwd_d64_fp16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask.hip -> fmha_ck_autogen_4beca56234ff6fb4f23b9b24822887fd9a3d0df9.hip -fmha_fwd_d64_fp16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_dropout.hip -> fmha_ck_autogen_a8a4af070ee46d802cb11086b93daf91538f8a04.hip -fmha_fwd_d64_fp16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse.hip -> fmha_ck_autogen_79f182ae021e23869d7bebf2a9b4575bdc910ed0.hip -fmha_fwd_d64_fp16_group_hbs_b128x64x32x64x32x64_r4x1x1_r4x1x1_w32x32x16_qr_async_vr_psskddv_mask_lse_dropout.hip -> fmha_ck_autogen_770ad1eb1b30ad8f1e7c17df486093129b2d5630.hip diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/rename_ck_autogen_files.sh b/aten/src/ATen/native/transformers/hip/flash_attn/ck/rename_ck_autogen_files.sh deleted file mode 100644 index 0dc441e87ec3..000000000000 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/rename_ck_autogen_files.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -set -ex - -file_renaming_txt="rename_ck_autogen_files.output.txt" -rm -rf $file_renaming_txt -for file in `ls fmha_*wd*hip`; do - sha1=$(sha1sum $file | cut -d' ' -f1) - new_file="fmha_ck_autogen_${sha1}.hip" - mv $file $new_file - echo "$file -> $new_file" >> $file_renaming_txt -done From 84b3d7d1380ce8611607cb98aab395fde939acb0 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Tue, 18 Feb 2025 21:14:47 +0000 Subject: [PATCH 37/46] mha_varlen_fwd plumbing and replacing alibi_slopes --- .../native/transformers/cuda/attention.cu | 3 +- .../hip/flash_attn/ck/me_ck_api.h | 3 +- .../hip/flash_attn/ck/me_fwd_ck.hip | 5 +- .../hip/flash_attn/ck/mha_fwd_ck.hip | 4 +- .../hip/flash_attn/ck/mha_varlen_fwd_ck.hip | 48 +++++++++++-------- .../transformers/hip/flash_attn/flash_api.h | 7 +-- 6 files changed, 32 insertions(+), 38 deletions(-) diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index d6bec9e29f8f..67f820782f7d 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -1195,8 +1195,7 @@ std::tuple _efficient_ seqstart_q, seqstart_k, std::nullopt,// not passing in optional gen_ - seqused_k,// not passing in optional seqused_k_ - alibi_slopes);// not passing in optional alibi_slopes_ + seqused_k);// not passing in optional seqused_k_ } else { // use aotriton diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h index 439073c2c631..5a284c7ac6b7 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h @@ -30,8 +30,7 @@ mem_eff_forward_ck( const std::optional& seqstart_q, const std::optional& seqstart_k, std::optional gen_, - std::optional& seqused_k_, - std::optional& alibi_slopes_ + std::optional& seqused_k_ ); // TODO get return tensors correct diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip index db8ca2d66f6b..858a92e7549e 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip @@ -28,8 +28,7 @@ 
mem_eff_forward_ck( const std::optional& seqstart_q, const std::optional& seqstart_k, std::optional gen_, - std::optional& seqused_k_, - std::optional& alibi_slopes_) { + std::optional& seqused_k_) { std::cout << std::endl; std::cout << "MADE IT INTO MY CODE " << std::endl; @@ -50,7 +49,6 @@ mem_eff_forward_ck( k, // k v, // v out_, // opt(out_) - alibi_slopes_, // opt(alibi_slopes) p_dropout, // p_dropout scale.value(), // opt(softmax_scale) is_causal.value(), // opt(is_causal) @@ -73,7 +71,6 @@ mem_eff_forward_ck( cu_seqlens_q.value(), // cu_seqlens_q cu_seqlens_k.value(), // cu_seqlens_k seqused_k_, // opt(seqused_k) - alibi_slopes_, // opt(alibi_slopes) max_seqlen_q, // max_seqlen_q max_seqlen_k, // max_seqlen_k p_dropout, // p_dropout diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip index 7b4cd9ffec98..863172e18aea 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip @@ -57,7 +57,6 @@ fmha_fwd_args get_ck_fmha_fwd_args(bool has_lse, // v: (batch_size, seqlen_k, nheads_k, d) // o: (batch_size, seqlen_q, nheads, d) - // alibi_slopes:(batch_size, nheads) or (nhead) // attn_bias: (batch_size, nheads, seqlen_q, seqlen_k) // lse: (batch_size, nheads, seqlen_q) // randval: (batch_size, nheads, seqlen_q, seqlen_k) @@ -133,7 +132,7 @@ fmha_fwd_args get_ck_fmha_fwd_args(bool has_lse, nhead_stride_q, nhead_stride_k, nhead_stride_v, - 0, // nhead_stride_bias, FA without bias + 0, // nhead_stride_bias, FA without bias : TODO_ANDY CHECK IF WE NEED TO DO SOMETHING WITH THIS nhead_stride_randval, nhead_stride_lse, nhead_stride_o, @@ -157,7 +156,6 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size std::optional &out_, // batch_size x seqlen_q x num_heads xhead_size - std::optional &alibi_slopes_, // num_heads or batch_size x num_heads: Not used const float p_dropout, const float softmax_scale, bool is_causal, diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_fwd_ck.hip index e6b95824f84f..20ad315d3025 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_fwd_ck.hip @@ -13,7 +13,7 @@ fmha_fwd_traits get_ck_fmha_varlen_fwd_traits(const mask_info &mask, int head_size, bool has_dropout, bool has_lse, - bool enable_alibi) + bool enable_bias) { return fmha_fwd_traits{head_size, head_size, @@ -21,7 +21,7 @@ fmha_fwd_traits get_ck_fmha_varlen_fwd_traits(const mask_info &mask, true, // is_group_mode true, // is_v_rowmajor mask.type, - enable_alibi ? bias_enum::alibi : bias_enum::no_bias, + enable_bias ? 
bias_enum::elementwise_bias : bias_enum::no_bias, has_lse, has_dropout, false}; // do_fp8_static_quant @@ -42,11 +42,10 @@ fmha_fwd_args get_ck_fmha_varlen_fwd_args(bool has_lse, const at::Tensor v, const at::Tensor seqlens_q, const at::Tensor seqlens_k, - std::optional &alibi_slopes_, + std::optional &attn_bias_, at::Tensor out, at::Tensor softmax_lse, at::Tensor dropout_randval, - float softmax_scale, float p_dropout, std::pair drop_seed_offset) @@ -56,7 +55,7 @@ fmha_fwd_args get_ck_fmha_varlen_fwd_args(bool has_lse, // v: (total_k, nheads_k, d) // o: (total_q, nheads, d) - // alibi_slopes:(batch, nheads) or (nhead) + // attn_bias :(batch, nheads, max_seqlen_q, max_seqlen_k) // lse: (batch, nheads, max_seqlen_q) // randval: (nheads, total_q, max_seqlen_k) @@ -84,22 +83,23 @@ fmha_fwd_args get_ck_fmha_varlen_fwd_args(bool has_lse, ck_tile::index_t batch_stride_lse = has_lse ? softmax_lse.stride(0) : 0; ck_tile::index_t batch_stride_randval = 0; - void *alibi_slopes_ptr = nullptr; - ck_tile::index_t stride_alibi_slopes = 0; - - if (alibi_slopes_.has_value()) { - auto alibi_slopes = alibi_slopes_.value(); - CHECK_DEVICE(alibi_slopes); - TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension"); - TORCH_CHECK(alibi_slopes.sizes() == at::IntArrayRef({h}) || alibi_slopes.sizes() == at::IntArrayRef({b, h})); - alibi_slopes_ptr = alibi_slopes.data_ptr(); - stride_alibi_slopes = alibi_slopes.dim() == 2 ? alibi_slopes.stride(0) : 0; + void *attn_bias_ptr = nullptr; + ck_tile::index_t stride_attn_bias = 0; + + if (attn_bias_.has_value()) { + auto a_b = attn_bias_.value(); + CHECK_DEVICE(a_b); + TORCH_CHECK(a_b.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension"); + //TORCH_CHECK(alibi_slopes.sizes() == at::IntArrayRef({h}) || alibi_slopes.sizes() == at::IntArrayRef({b, h})); + attn_bias_ptr = a_b.data_ptr(); + //stride_attn_bias = alibi_slopes.dim() == 2 ? alibi_slopes.stride(0) : 0; + stride_attn_bias = a_b.stride(0); } return fmha_fwd_args{q.data_ptr(), k.data_ptr(), v.data_ptr(), - alibi_slopes_ptr, // bias + attn_bias_ptr, // bias has_dropout_randval ? dropout_randval.data_ptr() : nullptr, has_lse ? 
softmax_lse.data_ptr() : nullptr, out.data_ptr(), @@ -120,7 +120,7 @@ fmha_fwd_args get_ck_fmha_varlen_fwd_args(bool has_lse, stride_q, stride_k, stride_v, - stride_alibi_slopes, + stride_attn_bias, stride_randval, stride_o, nhead_stride_q, @@ -153,7 +153,6 @@ mha_varlen_fwd_ck(const at::Tensor &q, // total_q x num_heads const at::Tensor &cu_seqlens_q, // b+1 const at::Tensor &cu_seqlens_k, // b+1 std::optional & /*seqused_k*/, - std::optional &alibi_slopes_, // num_heads or b x num_heads int max_seqlen_q, const int max_seqlen_k, const float p_dropout, @@ -201,7 +200,7 @@ mha_varlen_fwd_ck(const at::Tensor &q, // total_q x num_heads const int max_num_blocks_per_seq = 0; const int num_blocks = 0; - if (max_seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; } // causal=true is the same as causal=false in this case + if (max_seqlen_q == 1 && !attn_bias_.has_value()) { is_causal = false; } // causal=true is the same as causal=false in this case // TODO // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case @@ -308,6 +307,13 @@ mha_varlen_fwd_ck(const at::Tensor &q, // total_q x num_heads flash::ParsePhiloxCudaState, dim3(1), dim3(64), 0, at::hip::getCurrentHIPStreamMasqueradingAsCUDA(), philox_args, rng_state_ptr); } + // remove const from attn_bias_ + std::optional attn_bias; + if( attn_bias_.has_value()) + { + attn_bias = attn_bias_; + } + if (max_seqlen_k > 0) { auto drop_seed_offset = std::make_pair(rng_state_ptr, rng_state_ptr + 1); @@ -315,7 +321,7 @@ mha_varlen_fwd_ck(const at::Tensor &q, // total_q x num_heads ck_tile::stream_config stream_config{stream}; auto traits = - get_ck_fmha_varlen_fwd_traits(mask, q_dtype_str, head_size_8x, has_dropout, has_lse, alibi_slopes_.has_value()); + get_ck_fmha_varlen_fwd_traits(mask, q_dtype_str, head_size_8x, has_dropout, has_lse, attn_bias_.has_value()); auto args = get_ck_fmha_varlen_fwd_args( @@ -332,7 +338,7 @@ mha_varlen_fwd_ck(const at::Tensor &q, // total_q x num_heads v_padded, cu_seqlens_q, cu_seqlens_k, - alibi_slopes_, + attn_bias, out, softmax_lse, p, diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h index aee9d9e1f48b..cbed61132842 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h @@ -143,8 +143,6 @@ mha_fwd_ck( const at::Tensor& v, // batch_size x seqlen_k x num_heads_k x head_size std::optional& out_, // batch_size x seqlen_q x num_heads x head_size - std::optional& - alibi_slopes_, // num_heads or batch_size x num_heads const float p_dropout, const float softmax_scale, bool is_causal, @@ -152,7 +150,7 @@ mha_fwd_ck( int window_size_right, const bool return_softmax, std::optional gen_, - const std::optional& attn_bias_); + const std::optional& attn_bias_); // batch_size x nheads x seqlen_q x seqlen_k std::tuple< at::Tensor, @@ -177,7 +175,6 @@ mha_varlen_fwd_ck( std::optional& seqused_k, // b. If given, only this many elements of each batch // element's keys are used. 
- std::optional& alibi_slopes_, // num_heads or b x num_heads int max_seqlen_q, const int max_seqlen_k, const float p_dropout, @@ -280,7 +277,6 @@ mha_fwd( k, v, out_, - alibi_slopes_, p_dropout, softmax_scale, is_causal, @@ -369,7 +365,6 @@ mha_varlen_fwd( cu_seqlens_q, cu_seqlens_k, seqused_k, - alibi_slopes_, max_seqlen_q, max_seqlen_k, p_dropout, From 1967d0520ebe6d2f9f25456e011d89e6c1a0c192 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Tue, 18 Feb 2025 22:09:20 +0000 Subject: [PATCH 38/46] Feeding grad_bias through to mha_bwd/varlen_ck --- .../hip/flash_attn/ck/me_bwd_ck.hip | 4 ++ .../hip/flash_attn/ck/mha_bwd_ck.hip | 47 ++++++++++--------- .../hip/flash_attn/ck/mha_varlen_bwd_ck.hip | 42 +++++++++-------- .../transformers/hip/flash_attn/flash_api.h | 14 +++++- 4 files changed, 65 insertions(+), 42 deletions(-) diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip index ef5a0e9e301c..e02d66688566 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip @@ -66,6 +66,8 @@ mem_eff_backward_ck( opt_dK, opt_dV, attn_bias, + bias_requires_grad, + grad_bias, p_dropout, scale, is_causal, @@ -97,6 +99,8 @@ mem_eff_backward_ck( cu_seqlens_q.value(), cu_seqlens_k.value(), attn_bias, + bias_requires_grad, + grad_bias, max_seqlen_q, max_seqlen_k, p_dropout, diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip index 28bd893da0f3..e2bd80342b50 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip @@ -12,7 +12,7 @@ fmha_bwd_traits get_ck_fmha_bwd_traits(const mask_info &mask, std::string dtype, int head_size, bool has_dropout, - bool enable_alibi, + bool enable_bias, bool deterministic) { return fmha_bwd_traits{head_size, @@ -20,8 +20,8 @@ fmha_bwd_traits get_ck_fmha_bwd_traits(const mask_info &mask, dtype, false, // is_group_mode mask.type, - enable_alibi ? bias_enum::alibi : bias_enum::no_bias, - false, // has_dbias + enable_bias ? bias_enum::elementwise_bias : bias_enum::no_bias, + false, // has_dbias // TODO_ANDY revisit has_dropout, false, // s_randval deterministic}; @@ -39,7 +39,7 @@ fmha_bwd_args get_ck_fmha_bwd_args(const mask_info &mask, const at::Tensor q, const at::Tensor k, const at::Tensor v, - std::optional &alibi_slopes_, + std::optional &attn_bias_, const at::Tensor out, const at::Tensor softmax_lse, const at::Tensor dout, @@ -105,25 +105,27 @@ fmha_bwd_args get_ck_fmha_bwd_args(const mask_info &mask, ck_tile::index_t stride_dq_acc = dq_acc.stride(2); ck_tile::index_t nhead_stride_dq_acc = dq_acc.stride(3); + //TODO_ANDY: need to add some stuff above for bias + + float p_undrop = 1.0 - p_dropout; - void *alibi_slopes_ptr = nullptr; - ck_tile::index_t stride_alibi_slopes = 0; - - if (alibi_slopes_.has_value()) { - auto alibi_slopes = alibi_slopes_.value(); - CHECK_DEVICE(alibi_slopes); - TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension"); - TORCH_CHECK(alibi_slopes.sizes() == at::IntArrayRef({h}) || alibi_slopes.sizes() == at::IntArrayRef({b, h})); - alibi_slopes_ptr = alibi_slopes.data_ptr(); - // alibi_slopes:(batch_size, nheads) or (nhead) - stride_alibi_slopes = alibi_slopes.dim() == 2 ? 
alibi_slopes.stride(0) : 0; + void *attn_bias_ptr = nullptr; + ck_tile::index_t stride_attn_bias = 0; + + if (attn_bias_.has_value()) { + auto a_b = attn_bias_.value(); + CHECK_DEVICE(a_b); + TORCH_CHECK(a_b.stride(-1) == 1, "Attention bias tensor must have contiguous last dimension"); + attn_bias_ptr = a_b.data_ptr(); + // attn_bias:(batch_size, nheads, seqlen_q, seqlen_k) + stride_attn_bias = a_b.stride(0); } return fmha_bwd_args{q.data_ptr(), k.data_ptr(), v.data_ptr(), - alibi_slopes_ptr, // bias + attn_bias_ptr, // bias out.data_ptr(), softmax_lse.data_ptr(), dout.data_ptr(), @@ -150,7 +152,7 @@ fmha_bwd_args get_ck_fmha_bwd_args(const mask_info &mask, stride_q, stride_k, stride_v, - stride_alibi_slopes, + stride_attn_bias, stride_o, 0, // stride_randval stride_do, @@ -158,7 +160,7 @@ fmha_bwd_args get_ck_fmha_bwd_args(const mask_info &mask, stride_dq, stride_dk, stride_dv, - 0, // stride_dbias, FA without bias + 0, // stride_dbias, FA without bias TODO_ANDY: will probably need these nhead_stride_q, nhead_stride_k, nhead_stride_v, @@ -204,7 +206,9 @@ mha_bwd_ck(const at::Tensor &dout, // batch_size x seqlen_q x std::optional &dq_, // batch_size x seqlen_q x num_heads x head_size std::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size std::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size - std::optional &alibi_slopes_, // num_heads or batch_size x num_heads + std::optional &attn_bias_, // num_heads or batch_size x num_heads + bool bias_requires_grad, + std::optional &grad_bias, const float p_dropout, // probability to drop const float softmax_scale, const bool is_causal, @@ -354,7 +358,7 @@ mha_bwd_ck(const at::Tensor &dout, // batch_size x seqlen_q x ck_tile::stream_config stream_config{stream}; dq.zero_(); // ck use atomic operation on dq auto traits = - get_ck_fmha_bwd_traits(mask, q_dtype_str, head_size_8x, is_dropout, alibi_slopes_.has_value(), deterministic); + get_ck_fmha_bwd_traits(mask, q_dtype_str, head_size_8x, is_dropout, attn_bias_.has_value(), deterministic); auto args = get_ck_fmha_bwd_args( @@ -368,7 +372,7 @@ mha_bwd_ck(const at::Tensor &dout, // batch_size x seqlen_q x q, k, v, - alibi_slopes_, + attn_bias_, out, softmax_lse, dout_padded, @@ -400,6 +404,7 @@ mha_bwd_ck(const at::Tensor &dout, // batch_size x seqlen_q x dv = dv.index({"...", at::indexing::Slice(at::indexing::None, head_size_og)}); } + //TODO_ANDY need to return dGrad also return { dq, dk, dv, softmax_d }; } } // namespace pytorch_flash diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip index d57ead48e2b5..c44423c5d846 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip @@ -14,7 +14,7 @@ fmha_bwd_traits get_ck_fmha_varlen_bwd_traits(const mask_info &mask, std::string dtype, int head_size, bool has_dropout, - bool enable_alibi, + bool enable_bias, bool deterministic) { return fmha_bwd_traits{head_size, @@ -22,7 +22,7 @@ fmha_bwd_traits get_ck_fmha_varlen_bwd_traits(const mask_info &mask, dtype, true, // is_group_mode mask.type, - enable_alibi ? bias_enum::alibi : bias_enum::no_bias, + enable_bias ? 
bias_enum::elementwise_bias : bias_enum::no_bias, false, // has_dbias has_dropout, false, // s_randval @@ -43,7 +43,7 @@ fmha_bwd_args get_ck_fmha_varlen_bwd_args(const mask_info &mask, const at::Tensor v, const at::Tensor seqlens_q, const at::Tensor seqlens_k, - std::optional &alibi_slopes_, + std::optional &attn_bias_, const at::Tensor out, const at::Tensor softmax_lse, const at::Tensor dout, @@ -113,25 +113,27 @@ fmha_bwd_args get_ck_fmha_varlen_bwd_args(const mask_info &mask, ck_tile::index_t stride_dq_acc = dq_acc.stride(1); ck_tile::index_t nhead_stride_dq_acc = dq_acc.stride(2); + //TODO_ANDY: Probably need to handle some bias stuff similar to the above + + float p_undrop = 1.0 - p_dropout; - void *alibi_slopes_ptr = nullptr; - ck_tile::index_t stride_alibi_slopes = 0; - - if (alibi_slopes_.has_value()) { - auto alibi_slopes = alibi_slopes_.value(); - CHECK_DEVICE(alibi_slopes); - TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension"); - TORCH_CHECK(alibi_slopes.sizes() == at::IntArrayRef({h}) || alibi_slopes.sizes() == at::IntArrayRef({b, h})); - alibi_slopes_ptr = alibi_slopes.data_ptr(); - // alibi_slopes:(batch_size, nheads) or (nhead) - stride_alibi_slopes = alibi_slopes.dim() == 2 ? alibi_slopes.stride(0) : 0; + void *attn_bias_ptr = nullptr; + ck_tile::index_t stride_attn_bias = 0; + + if (attn_bias_.has_value()) { + auto a_b = attn_bias_.value(); + CHECK_DEVICE(a_b); + TORCH_CHECK(a_b.stride(-1) == 1, "Attention bias tensor must have contiguous last dimension"); + attn_bias_ptr = a_b.data_ptr(); + // attn_bias :(batch_size, nheads, seqlen_q, seqlen_k) + stride_attn_bias = a_b.stride(0); } return fmha_bwd_args{q.data_ptr(), k.data_ptr(), v.data_ptr(), - alibi_slopes_ptr, // bias + attn_bias_ptr, // bias out.data_ptr(), softmax_lse.data_ptr(), dout.data_ptr(), @@ -158,7 +160,7 @@ fmha_bwd_args get_ck_fmha_varlen_bwd_args(const mask_info &mask, stride_q, stride_k, stride_v, - stride_alibi_slopes, + stride_attn_bias, stride_o, 0, // stride_randval stride_do, @@ -214,7 +216,9 @@ mha_varlen_bwd_ck(const at::Tensor &dout, // total_q x num_hea std::optional &dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i const at::Tensor &cu_seqlens_q, // b+1 const at::Tensor &cu_seqlens_k, // b+1 - std::optional &alibi_slopes_, // num_heads or b x num_heads + std::optional &attn_bias_, // b x num_heads x seqlen_q x seqlen_k + bool bias_requires_grad, + std::optional &grad_bias, const int max_seqlen_q, const int max_seqlen_k, // max sequence length to choose the kernel const float p_dropout, // probability to drop @@ -381,7 +385,7 @@ mha_varlen_bwd_ck(const at::Tensor &dout, // total_q x num_hea ck_tile::stream_config stream_config{stream}; dq.zero_(); // ck use atomic operation on dq auto traits = - get_ck_fmha_varlen_bwd_traits(mask, q_dtype_str, head_size_8x, is_dropout, alibi_slopes_.has_value(), deterministic); + get_ck_fmha_varlen_bwd_traits(mask, q_dtype_str, head_size_8x, is_dropout, attn_bias_.has_value(), deterministic); auto args = get_ck_fmha_varlen_bwd_args( @@ -397,7 +401,7 @@ mha_varlen_bwd_ck(const at::Tensor &dout, // total_q x num_hea v, cu_seqlens_q, cu_seqlens_k, - alibi_slopes_, + attn_bias_, out, softmax_lse, dout_padded, diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h index cbed61132842..7ea8713c3873 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h +++ 
b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h @@ -201,7 +201,9 @@ std::tuple mha_bwd_ck( std::optional& dv_, // batch_size x seqlen_k x num_heads_k x head_size std::optional& - alibi_slopes_, // num_heads or batch_size x num_heads + attn_bias_, // batch_size x num_heads x seqlen_q x seqlen_k + bool bias_requires_grad, + std::optional& grad_bias, const float p_dropout, // probability to drop const float softmax_scale, const bool is_causal, @@ -229,7 +231,9 @@ std::tuple mha_varlen_bwd_ck( dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i const at::Tensor& cu_seqlens_q, // b+1 const at::Tensor& cu_seqlens_k, // b+1 - std::optional& alibi_slopes_, // num_heads or b x num_heads + std::optional& attn_bias_, // num_heads or b x num_heads + bool bias_requires_grad, + std::optional& grad_bias, const int max_seqlen_q, const int max_seqlen_k, // max sequence length to choose the kernel const float p_dropout, // probability to drop @@ -448,6 +452,7 @@ inline std::tuple mha_bwd( #if defined(USE_CK_FLASH_ATTENTION) if (at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { + std::optional non_null_dbias = std::nullopt; return mha_bwd_ck( dout, q, @@ -459,6 +464,8 @@ inline std::tuple mha_bwd( dk_, dv_, alibi_slopes_, + false, // bias_requires_grad + non_null_dbias, p_dropout, softmax_scale, is_causal, @@ -548,6 +555,7 @@ inline std::tuple mha_varlen_bwd #if defined(USE_CK_FLASH_ATTENTION) if (at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { + std::optional non_null_dbias = std::nullopt; return mha_varlen_bwd_ck( dout, q, @@ -561,6 +569,8 @@ inline std::tuple mha_varlen_bwd cu_seqlens_q, cu_seqlens_k, alibi_slopes_, + false, // bias_requires_grad + non_null_dbias, max_seqlen_q, max_seqlen_k, p_dropout, From ab7740bcc5fb6165b736feb39e8842b3cd0783ca Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Wed, 19 Feb 2025 20:15:46 +0000 Subject: [PATCH 39/46] passed dbias through to CK (pre-compile) --- .../hip/flash_attn/ck/mha_bwd_ck.hip | 49 +++++++++++++++---- .../hip/flash_attn/ck/mha_varlen_bwd_ck.hip | 5 +- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip index e2bd80342b50..ddbf77750a1b 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip @@ -13,7 +13,8 @@ fmha_bwd_traits get_ck_fmha_bwd_traits(const mask_info &mask, int head_size, bool has_dropout, bool enable_bias, - bool deterministic) + bool deterministic, + bool bias_requires_grad) { return fmha_bwd_traits{head_size, head_size, @@ -21,7 +22,7 @@ fmha_bwd_traits get_ck_fmha_bwd_traits(const mask_info &mask, false, // is_group_mode mask.type, enable_bias ? 
bias_enum::elementwise_bias : bias_enum::no_bias, - false, // has_dbias // TODO_ANDY revisit + bias_requires_grad, // has_dbias has_dropout, false, // s_randval deterministic}; @@ -40,6 +41,8 @@ fmha_bwd_args get_ck_fmha_bwd_args(const mask_info &mask, const at::Tensor k, const at::Tensor v, std::optional &attn_bias_, + bool bias_requires_grad, + std::optional &grad_bias, const at::Tensor out, const at::Tensor softmax_lse, const at::Tensor dout, @@ -108,18 +111,39 @@ fmha_bwd_args get_ck_fmha_bwd_args(const mask_info &mask, //TODO_ANDY: need to add some stuff above for bias + // dbias: (batch_size, nheads, seqlen_q, seqlen_k) // TODO_ANDY verify this + float p_undrop = 1.0 - p_dropout; void *attn_bias_ptr = nullptr; + ck_tile::index_t nhead_stride_bias = 0; + ck_tile::index_t batch_stride_bias = 0 ck_tile::index_t stride_attn_bias = 0; + // bias: (batch_size, nheads, seqlen_q, seqlen_k) if (attn_bias_.has_value()) { auto a_b = attn_bias_.value(); CHECK_DEVICE(a_b); TORCH_CHECK(a_b.stride(-1) == 1, "Attention bias tensor must have contiguous last dimension"); attn_bias_ptr = a_b.data_ptr(); - // attn_bias:(batch_size, nheads, seqlen_q, seqlen_k) - stride_attn_bias = a_b.stride(0); + stride_attn_bias = a_b.stride(2); + nhead_stride_bias = a_b.stride(1); + batch_stride_bias = a_b.stride(0); + } + + void *dbias_ptr = nullptr; + ck_tile::index_t stride_dbias = 0; + ck_tile::index_t nhead_stride_dbias = 0; + ck_tile::index_t batch_stride_dbias = 0; + // dbias: (batch_size, nheads, seqlen_q, seqlen_k) + if(bias_requires_grad) { + // If bias_requires_grad is true, grad_bias is guaranteed to have a value via line 270 + //grad_bias + dbias = grad_bias.value(); + dbias_ptr = dbias.data_ptr(); + stride_dbias = dbias.stride(2); + nhead_stride_dbias = dbias.stride(1); + batch_stride_dbias = dbias.stride(0); } return fmha_bwd_args{q.data_ptr(), @@ -134,7 +158,7 @@ fmha_bwd_args get_ck_fmha_bwd_args(const mask_info &mask, dq.data_ptr(), dk.data_ptr(), dv.data_ptr(), - nullptr, // dbias + dbias_ptr, // dbias dq_acc.data_ptr(), // dq_acc nullptr, // seqstart_q nullptr, // seqstart_k @@ -160,11 +184,11 @@ fmha_bwd_args get_ck_fmha_bwd_args(const mask_info &mask, stride_dq, stride_dk, stride_dv, - 0, // stride_dbias, FA without bias TODO_ANDY: will probably need these + stride_dbias, // stride_dbias, FA without bias TODO_ANDY: will probably need these nhead_stride_q, nhead_stride_k, nhead_stride_v, - 0, // nhead_stride_bias, FA without bias + nhead_stride_bias, // nhead_stride_bias, FA without bias nhead_stride_o, 0, // nhead_stride_randval nhead_stride_do, @@ -173,11 +197,11 @@ fmha_bwd_args get_ck_fmha_bwd_args(const mask_info &mask, nhead_stride_dq, nhead_stride_dk, nhead_stride_dv, - 0, // nhead_stride_dbias, FA without dbias + nhead_stride_dbias, // nhead_stride_dbias, FA without dbias batch_stride_q, batch_stride_k, batch_stride_v, - 0 , // batch_stride_bias, FA without bias + batch_stride_bias, // batch_stride_bias, FA without bias batch_stride_o, 0, // batch_stride_randval batch_stride_do, @@ -186,7 +210,7 @@ fmha_bwd_args get_ck_fmha_bwd_args(const mask_info &mask, batch_stride_dq, batch_stride_dk, batch_stride_dv, - 0 , // batch_stride_dbias, FA without dbias + batch_stride_dbias, // batch_stride_dbias, FA without dbias split_stride_dq_acc, mask.left, mask.right, @@ -246,6 +270,9 @@ mha_bwd_ck(const at::Tensor &dout, // batch_size x seqlen_q x TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension"); TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last 
dimension"); + TORCH_CHECK((bias_requires_grad && grad_bias.has_value()) || (!bias_requires_grad), + "If bias_requires_grad is set, grad_bias must have a value"); + const auto sizes = q.sizes(); const int batch_size = sizes[0]; @@ -373,6 +400,8 @@ mha_bwd_ck(const at::Tensor &dout, // batch_size x seqlen_q x k, v, attn_bias_, + bias_requires_grad, + grad_bias, out, softmax_lse, dout_padded, diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip index c44423c5d846..8348f612353a 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip @@ -15,7 +15,8 @@ fmha_bwd_traits get_ck_fmha_varlen_bwd_traits(const mask_info &mask, int head_size, bool has_dropout, bool enable_bias, - bool deterministic) + bool deterministic, + bool bias_requires_grad) { return fmha_bwd_traits{head_size, head_size, @@ -23,7 +24,7 @@ fmha_bwd_traits get_ck_fmha_varlen_bwd_traits(const mask_info &mask, true, // is_group_mode mask.type, enable_bias ? bias_enum::elementwise_bias : bias_enum::no_bias, - false, // has_dbias + bias_requires_grad, // has_dbias has_dropout, false, // s_randval deterministic}; From a7152d07b9498ee13867960079734a89f273e9b8 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Wed, 19 Feb 2025 22:40:09 +0000 Subject: [PATCH 40/46] pass dbias (post-compile) --- .../hip/flash_attn/ck/mha_bwd_ck.hip | 12 +++- .../hip/flash_attn/ck/mha_varlen_bwd_ck.hip | 55 ++++++++++++++----- 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip index ddbf77750a1b..5b5a354433a0 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip @@ -117,7 +117,7 @@ fmha_bwd_args get_ck_fmha_bwd_args(const mask_info &mask, void *attn_bias_ptr = nullptr; ck_tile::index_t nhead_stride_bias = 0; - ck_tile::index_t batch_stride_bias = 0 + ck_tile::index_t batch_stride_bias = 0; ck_tile::index_t stride_attn_bias = 0; // bias: (batch_size, nheads, seqlen_q, seqlen_k) @@ -139,7 +139,7 @@ fmha_bwd_args get_ck_fmha_bwd_args(const mask_info &mask, if(bias_requires_grad) { // If bias_requires_grad is true, grad_bias is guaranteed to have a value via line 270 //grad_bias - dbias = grad_bias.value(); + auto dbias = grad_bias.value(); dbias_ptr = dbias.data_ptr(); stride_dbias = dbias.stride(2); nhead_stride_dbias = dbias.stride(1); @@ -385,7 +385,13 @@ mha_bwd_ck(const at::Tensor &dout, // batch_size x seqlen_q x ck_tile::stream_config stream_config{stream}; dq.zero_(); // ck use atomic operation on dq auto traits = - get_ck_fmha_bwd_traits(mask, q_dtype_str, head_size_8x, is_dropout, attn_bias_.has_value(), deterministic); + get_ck_fmha_bwd_traits(mask, + q_dtype_str, + head_size_8x, + is_dropout, + attn_bias_.has_value(), + deterministic, + bias_requires_grad); auto args = get_ck_fmha_bwd_args( diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip index 8348f612353a..7a969ed62a87 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip @@ -45,6 +45,8 @@ fmha_bwd_args 
get_ck_fmha_varlen_bwd_args(const mask_info &mask, const at::Tensor seqlens_q, const at::Tensor seqlens_k, std::optional &attn_bias_, + bool bias_requires_grad, + std::optional &grad_bias, const at::Tensor out, const at::Tensor softmax_lse, const at::Tensor dout, @@ -114,21 +116,37 @@ fmha_bwd_args get_ck_fmha_varlen_bwd_args(const mask_info &mask, ck_tile::index_t stride_dq_acc = dq_acc.stride(1); ck_tile::index_t nhead_stride_dq_acc = dq_acc.stride(2); - //TODO_ANDY: Probably need to handle some bias stuff similar to the above - - float p_undrop = 1.0 - p_dropout; + //TODO_ANDY: Probably need to handle some bias stuff similar to the above + // bias: (batch_size, nheads, seqlen_q, seqlen_k) void *attn_bias_ptr = nullptr; + ck_tile::index_t nhead_stride_bias = 0; + ck_tile::index_t batch_stride_bias = 0; ck_tile::index_t stride_attn_bias = 0; - if (attn_bias_.has_value()) { auto a_b = attn_bias_.value(); CHECK_DEVICE(a_b); TORCH_CHECK(a_b.stride(-1) == 1, "Attention bias tensor must have contiguous last dimension"); attn_bias_ptr = a_b.data_ptr(); - // attn_bias :(batch_size, nheads, seqlen_q, seqlen_k) - stride_attn_bias = a_b.stride(0); + stride_attn_bias = a_b.stride(2); + nhead_stride_bias = a_b.stride(1); + batch_stride_bias = a_b.stride(0); + } + + void *dbias_ptr = nullptr; + ck_tile::index_t stride_dbias = 0; + ck_tile::index_t nhead_stride_dbias = 0; + ck_tile::index_t batch_stride_dbias = 0; + // dbias: (batch_size, nheads, seqlen_q, seqlen_k) + if(bias_requires_grad) { + // If bias_requires_grad is true, grad_bias is guaranteed to have a value via line 270 + //grad_bias + auto dbias = grad_bias.value(); + dbias_ptr = dbias.data_ptr(); + stride_dbias = dbias.stride(2); + nhead_stride_dbias = dbias.stride(1); + batch_stride_dbias = dbias.stride(0); } return fmha_bwd_args{q.data_ptr(), @@ -143,7 +161,7 @@ fmha_bwd_args get_ck_fmha_varlen_bwd_args(const mask_info &mask, dq.data_ptr(), dk.data_ptr(), dv.data_ptr(), - nullptr, // dbias + dbias_ptr, // dbias dq_acc.data_ptr(), // dq_acc seqlens_q.data_ptr(), // seqstart_q seqlens_k.data_ptr(), // seqstart_k @@ -169,11 +187,11 @@ fmha_bwd_args get_ck_fmha_varlen_bwd_args(const mask_info &mask, stride_dq, stride_dk, stride_dv, - 0, // stride_dbias, FA without bias + stride_dbias, // stride_dbias, FA without bias nhead_stride_q, nhead_stride_k, nhead_stride_v, - 0, // nhead_stride_bias, FA without bias + nhead_stride_bias, // nhead_stride_bias, FA without bias nhead_stride_o, 0, // nhead_stride_randval nhead_stride_do, @@ -182,11 +200,11 @@ fmha_bwd_args get_ck_fmha_varlen_bwd_args(const mask_info &mask, nhead_stride_dq, nhead_stride_dk, nhead_stride_dv, - 0, // nhead_stride_dbias, FA without dbias + nhead_stride_dbias, // nhead_stride_dbias, FA without dbias batch_stride_q, batch_stride_k, batch_stride_v, - 0 , // batch_stride_bias, FA without bias + batch_stride_bias, // batch_stride_bias, FA without bias batch_stride_o, 0, // batch_stride_randval batch_stride_do, @@ -195,7 +213,7 @@ fmha_bwd_args get_ck_fmha_varlen_bwd_args(const mask_info &mask, batch_stride_dq, batch_stride_dk, batch_stride_dv, - 0 , // batch_stride_dbias, FA without dbias + batch_stride_dbias, // batch_stride_dbias, FA without dbias split_stride_dq_acc, mask.left, mask.right, @@ -265,6 +283,9 @@ mha_varlen_bwd_ck(const at::Tensor &dout, // total_q x num_hea CHECK_CONTIGUOUS(cu_seqlens_q); CHECK_CONTIGUOUS(cu_seqlens_k); + TORCH_CHECK((bias_requires_grad && grad_bias.has_value()) || (!bias_requires_grad), + "If bias_requires_grad is set, grad_bias must have a 
value"); + const auto sizes = q.sizes(); const int total_q = sizes[0]; @@ -386,7 +407,13 @@ mha_varlen_bwd_ck(const at::Tensor &dout, // total_q x num_hea ck_tile::stream_config stream_config{stream}; dq.zero_(); // ck use atomic operation on dq auto traits = - get_ck_fmha_varlen_bwd_traits(mask, q_dtype_str, head_size_8x, is_dropout, attn_bias_.has_value(), deterministic); + get_ck_fmha_varlen_bwd_traits(mask, + q_dtype_str, + head_size_8x, + is_dropout, + attn_bias_.has_value(), + deterministic, + bias_requires_grad); auto args = get_ck_fmha_varlen_bwd_args( @@ -403,6 +430,8 @@ mha_varlen_bwd_ck(const at::Tensor &dout, // total_q x num_hea cu_seqlens_q, cu_seqlens_k, attn_bias_, + bias_requires_grad, + grad_bias, out, softmax_lse, dout_padded, From 58f4c62ddfc0c6d41e0fecb2d827111ac73b756a Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Thu, 20 Feb 2025 17:14:40 +0000 Subject: [PATCH 41/46] Returning dbias up (pre-compile) --- .../hip/flash_attn/ck/me_bwd_ck.hip | 10 +- .../hip/flash_attn/ck/mha_bwd_ck.hip | 14 ++- .../hip/flash_attn/ck/mha_varlen_bwd_ck.hip | 11 +- .../transformers/hip/flash_attn/flash_api.h | 110 ++++++++++-------- 4 files changed, 87 insertions(+), 58 deletions(-) diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip index e02d66688566..dd8059f2d919 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip @@ -54,7 +54,8 @@ mem_eff_backward_ck( [dQ, dK, dV, - softmax_d] = + softmax_d, + dBias] = mha_bwd_ck( dout, q, @@ -77,7 +78,7 @@ mem_eff_backward_ck( philox_seed, philox_offset); //TODO_ANDY: make this also return attention bias - return std::make_tuple(dQ, dK, dV, softmax_d); + return std::make_tuple(std::move(dQ), std::move(dK), std::move(dV), std::move(dBias)); } else { // call mha_varlen_bwd_ck @@ -85,7 +86,8 @@ mem_eff_backward_ck( [dQ, dK, dV, - softmax_d] = + softmax_d, + dBias] = mha_varlen_bwd_ck( dout, q, @@ -112,7 +114,7 @@ mem_eff_backward_ck( deterministic, philox_seed, philox_offset); - return std::make_tuple(dQ, dK, dV, softmax_d); + return std::make_tuple(std::move(dQ), std::move(dK), std::move(dV), std::move(dBias)); } diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip index 5b5a354433a0..cfade7408dff 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip @@ -219,8 +219,8 @@ fmha_bwd_args get_ck_fmha_bwd_args(const mask_info &mask, p_undrop, drop_seed_offset}; } - -std::tuple +//START HERE ANDY - JUST ADDED THE FIFTH RETURN TYPE, MAKE SURE WE ARE RETURNING DBIAS +std::tuple mha_bwd_ck(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_size_og const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size @@ -440,6 +440,14 @@ mha_bwd_ck(const at::Tensor &dout, // batch_size x seqlen_q x } //TODO_ANDY need to return dGrad also - return { dq, dk, dv, softmax_d }; + at::Tensor dbias; + if(bias_requires_grad) { + dbias = grad_bias.value(); + } else { + dbias = at::empty({batch_size, num_heads, seqlen_q, seqlen_k}, q.options()); + } + + + return { dq, dk, dv, softmax_d, dbias }; } } // namespace pytorch_flash diff --git 
a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip index 7a969ed62a87..23b85939b3a7 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip @@ -223,7 +223,7 @@ fmha_bwd_args get_ck_fmha_varlen_bwd_args(const mask_info &mask, drop_seed_offset}; } -std::tuple +std::tuple mha_varlen_bwd_ck(const at::Tensor &dout, // total_q x num_heads x head_size const at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i @@ -462,7 +462,14 @@ mha_varlen_bwd_ck(const at::Tensor &dout, // total_q x num_hea dk = dk.index({"...", at::indexing::Slice(at::indexing::None, head_size_og)}); dv = dv.index({"...", at::indexing::Slice(at::indexing::None, head_size_og)}); } + at::Tensor dbias; + if(bias_requires_grad) { + dbias = grad_bias.value(); + } else { + dbias = empty({batch_size, num_heads, seqlen_q, seqlen_k}, q.options()); + } + - return { dq, dk, dv, softmax_d }; + return { dq, dk, dv, softmax_d, dbias }; } } // namespace pytorch_flash diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h index 7ea8713c3873..f26a914af905 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h @@ -187,7 +187,7 @@ mha_varlen_fwd_ck( std::optional gen_, const std::optional& attn_bias_); -std::tuple mha_bwd_ck( +std::tuple mha_bwd_ck( const at::Tensor& dout, // batch_size x seqlen_q x num_heads, x head_size_og const at::Tensor& q, // batch_size x seqlen_q x num_heads x head_size const at::Tensor& k, // batch_size x seqlen_k x num_heads_k x head_size @@ -213,7 +213,7 @@ std::tuple mha_bwd_ck( const at::Tensor philox_seed, const at::Tensor philox_offset); -std::tuple mha_varlen_bwd_ck( +std::tuple mha_varlen_bwd_ck( const at::Tensor& dout, // total_q x num_heads, x head_size const at::Tensor& q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i @@ -453,27 +453,33 @@ inline std::tuple mha_bwd( if (at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { std::optional non_null_dbias = std::nullopt; - return mha_bwd_ck( - dout, - q, - k, - v, - out, - softmax_lse, - dq_, - dk_, - dv_, - alibi_slopes_, - false, // bias_requires_grad - non_null_dbias, - p_dropout, - softmax_scale, - is_causal, - window_size_left, - window_size_right, - deterministic, - philox_seed, - philox_offset); + auto[dQuery, + dKey, + dValue, + dSoftmax, + dBias] = mha_bwd_ck( + dout, + q, + k, + v, + out, + softmax_lse, + dq_, + dk_, + dv_, + alibi_slopes_, + false, // bias_requires_grad + non_null_dbias, + p_dropout, + softmax_scale, + is_causal, + window_size_left, + window_size_right, + deterministic, + philox_seed, + philox_offset); + // for FA return [dQ, dV, dK, dSoftmax] + return std::make_tuple(std::move(dQuery), std::move(dKey), std::move(dValue), std::move(dSoftmax)); } else { return mha_bwd_aot( dout, @@ -556,32 +562,38 @@ inline std::tuple mha_varlen_bwd if (at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { std::optional non_null_dbias = std::nullopt; - return mha_varlen_bwd_ck( - dout, - q, - k, - v, - out, - softmax_lse, - dq_, - dk_, - dv_, - cu_seqlens_q, - cu_seqlens_k, - alibi_slopes_, - false, // 
bias_requires_grad - non_null_dbias, - max_seqlen_q, - max_seqlen_k, - p_dropout, - softmax_scale, - zero_tensors, - is_causal, - window_size_left, - window_size_right, - deterministic, - philox_seed, - philox_offset); + auto[dQuery, + dKey, + dValue, + dSoftmax, + dBias] = mha_varlen_bwd_ck( + dout, + q, + k, + v, + out, + softmax_lse, + dq_, + dk_, + dv_, + cu_seqlens_q, + cu_seqlens_k, + alibi_slopes_, + false, // bias_requires_grad + non_null_dbias, + max_seqlen_q, + max_seqlen_k, + p_dropout, + softmax_scale, + zero_tensors, + is_causal, + window_size_left, + window_size_right, + deterministic, + philox_seed, + philox_offset); + // for FA return [dQ, dV, dK, dSoftmax] + return std::make_tuple(std::move(dQuery), std::move(dKey), std::move(dValue), std::move(dSoftmax)); } else { return mha_varlen_bwd_aot( dout, From 49aafd8471e18af3006a741c50fdf7b6dfd3a8fc Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Thu, 20 Feb 2025 17:41:20 +0000 Subject: [PATCH 42/46] returning bias up (post-compile) --- .../native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip index 23b85939b3a7..3e31e77a251c 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip @@ -466,7 +466,7 @@ mha_varlen_bwd_ck(const at::Tensor &dout, // total_q x num_hea if(bias_requires_grad) { dbias = grad_bias.value(); } else { - dbias = empty({batch_size, num_heads, seqlen_q, seqlen_k}, q.options()); + dbias = at::empty({batch_size, num_heads, max_seqlen_q, max_seqlen_k}, q.options()); } From db02be9d44688b2e7ff2e1c7a563dcebe42bcb74 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Thu, 20 Feb 2025 19:09:28 +0000 Subject: [PATCH 43/46] Add branch on CK preferred backend1 --- .../transformers/cuda/attention_backward.cu | 171 +++++++++--------- 1 file changed, 85 insertions(+), 86 deletions(-) diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu index 89fd19383785..bcb4cb45dbc2 100644 --- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu +++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu @@ -410,8 +410,8 @@ _efficient_attention_backward( #ifdef USE_ROCM // ROCM Implementation -// if(at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) -// { + if(at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) + { std::cout << "BACKWARD CK ATTENTION" << std::endl; const auto my_softmax_scale = sdp::calculate_scale(query, scale).expect_float(); // Store grad_bias in optional @@ -447,92 +447,91 @@ _efficient_attention_backward( philox_seed, philox_offset); -// } - - - // TODO_ANDY: Put this in the `else` part of the above condish - TORCH_CHECK(!num_splits_key.has_value(), - "ROCM does not support num_split_keys in _efficient_attention_forward"); - TORCH_CHECK(!window_size.has_value(), - "ROCM does not support window_size in _efficient_attention_forward"); - auto ret = aotriton::v2::flash::check_gpu(stream); - if (hipSuccess != ret) { - TORCH_CHECK(false, + } else { + // Use aotriton + TORCH_CHECK(!num_splits_key.has_value(), + "ROCM does not support num_split_keys in _efficient_attention_forward"); + TORCH_CHECK(!window_size.has_value(), + "ROCM does not support window_size in 
_efficient_attention_forward"); + auto ret = aotriton::v2::flash::check_gpu(stream); + if (hipSuccess != ret) { + TORCH_CHECK(false, "[AOTriton] Accelerated SDPA only supports MI200/MI300X/Navi31 GPUs" " (gfx90a:sramecc+:xnack-/gfx942:sramecc+:xnack-/gfx1100)") - } - const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float(); - bool is_causal; - if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { - is_causal = true; - } else if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { - is_causal = false; - } else { - TORCH_CHECK(false, "[_efficient_attention_backward] Unsupported mask type in AOTriton, for now"); - } - at::Tensor q_t = query.permute({0,2,1,3}); - at::Tensor k_t = key.permute({0,2,1,3}); - at::Tensor v_t = value.permute({0,2,1,3}); - at::Tensor out_t = out.permute({0,2,1,3}); - at::Tensor dq_t = grad_q.permute({0,2,1,3}); - at::Tensor dk_t = grad_k.permute({0,2,1,3}); - at::Tensor dv_t = grad_v.permute({0,2,1,3}); - at::Tensor dout_t = grad_out.permute({0,2,1,3}); - at::Tensor softmax_lse = logsumexp.view({B * nH, max_seqlen_q}); - at::Tensor delta = at::empty_like(softmax_lse).contiguous(); - - hipError_t err; - using aotriton::v2::flash::attn_bwd; - using aotriton::v2::flash::attn_bwd_compact_varlen; - using sdp::aotriton_adapter::mk_aotensor; - using sdp::aotriton_adapter::mk_aoscalartensor; - using sdp::aotriton_adapter::cast_dtype; - aotriton::TensorView<4> empty_t4(0, {0, 0, 0, 0}, {0, 0, 0, 0}, cast_dtype(query.dtype())); - if (cu_seqlens_q.has_value()) { - // varlen aka Nested tensor - err = attn_bwd_compact_varlen(mk_aotensor(q_t, "q"), - mk_aotensor(k_t, "k"), - mk_aotensor(v_t, "v"), - mk_aotensor<1>(cu_seqlens_q.value(), "cu_seqlens_q"), - mk_aotensor<1>(cu_seqlens_k.value(), "cu_seqlens_k"), - max_seqlen_q, - max_seqlen_k, - bias.has_value() ? mk_aotensor(bias.value(), "bias") : empty_t4, - softmax_scale, - mk_aotensor(out_t, "out"), - mk_aotensor(dout_t, "dout"), - mk_aotensor(dq_t, "dq"), - mk_aotensor(dk_t, "dk"), - mk_aotensor(dv_t, "dv"), - bias_requires_grad ? mk_aotensor(grad_bias, "db") : empty_t4, - mk_aotensor<2>(softmax_lse, "L"), - mk_aotensor<2>(delta, "delta"), - float(dropout_p), - mk_aoscalartensor(philox_seed), - mk_aoscalartensor(philox_offset), - 0, - is_causal, - stream); - } else { - err = attn_bwd(mk_aotensor(q_t, "q"), - mk_aotensor(k_t, "k"), - mk_aotensor(v_t, "v"), - bias.has_value() ? mk_aotensor(bias.value(), "bias") : empty_t4, - softmax_scale, - mk_aotensor(out_t, "out"), - mk_aotensor(dout_t, "dout"), - mk_aotensor(dq_t, "dq"), - mk_aotensor(dk_t, "dk"), - mk_aotensor(dv_t, "dv"), - bias_requires_grad ? 
mk_aotensor(grad_bias, "db") : empty_t4, - mk_aotensor<2>(softmax_lse, "L"), - mk_aotensor<2>(delta, "delta"), - float(dropout_p), - mk_aoscalartensor(philox_seed), - mk_aoscalartensor(philox_offset), - 0, - is_causal, - stream); + } + const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float(); + bool is_causal; + if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { + is_causal = true; + } else if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { + is_causal = false; + } else { + TORCH_CHECK(false, "[_efficient_attention_backward] Unsupported mask type in AOTriton, for now"); + } + at::Tensor q_t = query.permute({0,2,1,3}); + at::Tensor k_t = key.permute({0,2,1,3}); + at::Tensor v_t = value.permute({0,2,1,3}); + at::Tensor out_t = out.permute({0,2,1,3}); + at::Tensor dq_t = grad_q.permute({0,2,1,3}); + at::Tensor dk_t = grad_k.permute({0,2,1,3}); + at::Tensor dv_t = grad_v.permute({0,2,1,3}); + at::Tensor dout_t = grad_out.permute({0,2,1,3}); + at::Tensor softmax_lse = logsumexp.view({B * nH, max_seqlen_q}); + at::Tensor delta = at::empty_like(softmax_lse).contiguous(); + + hipError_t err; + using aotriton::v2::flash::attn_bwd; + using aotriton::v2::flash::attn_bwd_compact_varlen; + using sdp::aotriton_adapter::mk_aotensor; + using sdp::aotriton_adapter::mk_aoscalartensor; + using sdp::aotriton_adapter::cast_dtype; + aotriton::TensorView<4> empty_t4(0, {0, 0, 0, 0}, {0, 0, 0, 0}, cast_dtype(query.dtype())); + if (cu_seqlens_q.has_value()) { + // varlen aka Nested tensor + err = attn_bwd_compact_varlen(mk_aotensor(q_t, "q"), + mk_aotensor(k_t, "k"), + mk_aotensor(v_t, "v"), + mk_aotensor<1>(cu_seqlens_q.value(), "cu_seqlens_q"), + mk_aotensor<1>(cu_seqlens_k.value(), "cu_seqlens_k"), + max_seqlen_q, + max_seqlen_k, + bias.has_value() ? mk_aotensor(bias.value(), "bias") : empty_t4, + softmax_scale, + mk_aotensor(out_t, "out"), + mk_aotensor(dout_t, "dout"), + mk_aotensor(dq_t, "dq"), + mk_aotensor(dk_t, "dk"), + mk_aotensor(dv_t, "dv"), + bias_requires_grad ? mk_aotensor(grad_bias, "db") : empty_t4, + mk_aotensor<2>(softmax_lse, "L"), + mk_aotensor<2>(delta, "delta"), + float(dropout_p), + mk_aoscalartensor(philox_seed), + mk_aoscalartensor(philox_offset), + 0, + is_causal, + stream); + } else { + err = attn_bwd(mk_aotensor(q_t, "q"), + mk_aotensor(k_t, "k"), + mk_aotensor(v_t, "v"), + bias.has_value() ? mk_aotensor(bias.value(), "bias") : empty_t4, + softmax_scale, + mk_aotensor(out_t, "out"), + mk_aotensor(dout_t, "dout"), + mk_aotensor(dq_t, "dq"), + mk_aotensor(dk_t, "dk"), + mk_aotensor(dv_t, "dv"), + bias_requires_grad ? 
mk_aotensor(grad_bias, "db") : empty_t4, + mk_aotensor<2>(softmax_lse, "L"), + mk_aotensor<2>(delta, "delta"), + float(dropout_p), + mk_aoscalartensor(philox_seed), + mk_aoscalartensor(philox_offset), + 0, + is_causal, + stream); + } } #else // USE_CUDA at::Tensor workspace; From 1d727fee533f8dcdb90dd08e3356db99edb2e580 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Thu, 20 Feb 2025 21:05:27 +0000 Subject: [PATCH 44/46] Sanity is working E2E needs clean up and final verification --- aten/src/ATen/native/transformers/cuda/attention.cu | 11 +++++++++-- .../native/transformers/cuda/attention_backward.cu | 4 ++++ .../transformers/hip/flash_attn/ck/me_bwd_ck.hip | 1 + .../transformers/hip/flash_attn/ck/mha_bwd_ck.hip | 4 ++++ .../transformers/hip/flash_attn/ck/mha_fwd_ck.hip | 2 ++ test/test_transformers.py | 2 +- 6 files changed, 21 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index 67f820782f7d..e84d5e666f59 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -858,6 +858,7 @@ std::tuple _scaled_dot_product_efficient_attenti // Query -> Query(Batch x Q_seq_len x Num_heads x Dim_per_head) // Key -> Key(Batch x KV_seq_len x Num_heads x Dim_per_head) // Value -> Value(Batch x KV_seq_len x Num_heads x Dim_per_head) + std::cout << std::endl; std::cout << "sdpa_ef" << std::endl; std::cout << "q.sizes : " << query.sizes() << std::endl; Tensor q_t = query.transpose(1, 2); @@ -1176,7 +1177,7 @@ std::tuple _efficient_ q, k, v, - logsumexp, + logsumex, seed_t, offset_t, p] = @@ -1197,7 +1198,10 @@ std::tuple _efficient_ std::nullopt,// not passing in optional gen_ seqused_k);// not passing in optional seqused_k_ - + logsumexp = logsumex; + std::cout << "returned IN MEM_EFF_FORWARD SOFTMAX_DEVICE: " << logsumex.device() << std::endl; + std::cout << "other one IN MEM_EFF_FORWARD SOFTMAX_DEVICE: " << logsumexp.device() << std::endl; + std::cout << "logsum shape: " << logsumexp.sizes() << std::endl; } else { // use aotriton auto ret = aotriton::v2::flash::check_gpu(stream); if (hipSuccess != ret) { @@ -1289,6 +1293,7 @@ std::tuple _efficient_ query.options().dtype(at::ScalarType::Float)); } } // CK BACKEND + std::cout << "AFTER MY FWD CODE RAN SOFTMAX_DEVICE" << logsumexp.device() << std::endl; #else // CUDA Implementation cudaDeviceProp* p = at::cuda::getDeviceProperties(query.device().index()); @@ -1461,6 +1466,8 @@ std::tuple _efficient_ AT_CUDA_CHECK(cudaGetLastError()); #endif // USE_ROCM + std::cout << "COMPUTE_LOG_SUM?????: " << compute_logsumexp << std::endl; + std::cout << "RETURNING FROM MEM_EFF_FWD SOFTMAX_DEVICE: " << logsumexp.device() << std::endl; std::cout << "res dtype: " << res.dtype() << std::endl; return std::make_tuple( std::move(res), diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu index bcb4cb45dbc2..fb6d3b32b6d2 100644 --- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu +++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu @@ -410,9 +410,12 @@ _efficient_attention_backward( #ifdef USE_ROCM // ROCM Implementation + std::cout << "HITTING CORRECT PATH" << std::endl; + std::cout << "bias_requires_grad: " << bias_requires_grad << std::endl; if(at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { std::cout << "BACKWARD CK ATTENTION" << std::endl; + std::cout << 
"EFFICIENT_ATTENTION SOFTMAX_DEVICE: " << logsumexp.device() << std::endl; const auto my_softmax_scale = sdp::calculate_scale(query, scale).expect_float(); // Store grad_bias in optional std::optional opt_grad_bias = grad_bias; @@ -889,6 +892,7 @@ std::tuple _scaled_dot_product_e sdp::CustomMaskType custom_mask_type = causal ? sdp::CustomMaskType::CausalFromTopLeft : sdp::CustomMaskType::NoCustomMask; + std::cout << "sdpea SOFTMAX_DEVICE" << logsumexp.device() << std::endl; auto [grad_q, grad_k, grad_v, grad_bias] = at::_efficient_attention_backward( grad_out, diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip index dd8059f2d919..153156ae6ae1 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip @@ -50,6 +50,7 @@ mem_eff_backward_ck( // both of these return dq, dk, dv, softmax_d // need to also return attn_bias // call mha_bwd_ck + std::cout << "MEM_EFF_BWD_CK::SOFTMAX_DEVICE: " << softmax_lse.device() << std::endl; auto [dQ, dK, diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip index cfade7408dff..91073a39d5e5 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip @@ -5,6 +5,7 @@ #include #include #include +#include namespace pytorch_flash { @@ -262,6 +263,9 @@ mha_bwd_ck(const at::Tensor &dout, // batch_size x seqlen_q x std::string q_dtype_str = q_dtype == at::kHalf ? "fp16" : "bf16"; CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + std::cout << "ABOUT TO CHECK DEVICE OF SOFTMAX_LSE WEEEE" << std::endl; + std::cout << "BWD SOFTMAX_DEVICE: " << softmax_lse.device() << std::endl; + std::cout << softmax_lse << std::endl; CHECK_DEVICE(out); CHECK_DEVICE(dout); CHECK_DEVICE(softmax_lse); TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip index 863172e18aea..a45709298cab 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip @@ -357,6 +357,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x softmax_scale, p_dropout, drop_seed_offset); + std::cout << "FWD SOFTMAX_DEVICE: " << softmax_lse.device() << std::endl; float t = fmha_fwd(traits, args, stream_config); TORCH_CHECK(t >= 0, "invalid argument for fmha_fwd"); } @@ -371,6 +372,7 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size}); softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1}); } + std::cout << "END OF MHA_FWD_CK SOFTMAX_DEVICE: " << softmax_lse.device() << std::endl; return {out, q_padded, k_padded, v_padded, softmax_lse, seed_t, offset_t, p}; } } //namespace pytorch_flash diff --git a/test/test_transformers.py b/test/test_transformers.py index cf8d8461938b..f3ad8fb1edda 100644 --- a/test/test_transformers.py +++ b/test/test_transformers.py @@ -2624,7 +2624,7 @@ def test_mem_efficient_attention_mask_variants(self, device, mask_dim: list[int] mask = torch.randn((batch, num_heads, seq_len_q, seq_len_kv), device=device, dtype=dtype) with 
sdpa_kernel(backends=[SDPBackend.EFFICIENT_ATTENTION]): out = F.scaled_dot_product_attention(query, key, value, mask) - #out.sum().backward() + out.sum().backward() @unittest.skipIf(not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Fused SDPA was not built for this system") @parametrize("dtype", [torch.float, torch.float16]) From 2877d4c39a3b770dbc5dff0d6421049d403b6ef4 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Thu, 20 Feb 2025 21:13:37 +0000 Subject: [PATCH 45/46] cleaned up lse bug traces --- aten/src/ATen/native/transformers/cuda/attention.cu | 10 ++-------- .../native/transformers/cuda/attention_backward.cu | 2 -- .../transformers/hip/flash_attn/ck/me_bwd_ck.hip | 1 - .../transformers/hip/flash_attn/ck/mha_bwd_ck.hip | 3 --- .../transformers/hip/flash_attn/ck/mha_fwd_ck.hip | 2 -- 5 files changed, 2 insertions(+), 16 deletions(-) diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index e84d5e666f59..6d76653dcb3c 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -1177,7 +1177,7 @@ std::tuple _efficient_ q, k, v, - logsumex, + lse, seed_t, offset_t, p] = @@ -1198,10 +1198,7 @@ std::tuple _efficient_ std::nullopt,// not passing in optional gen_ seqused_k);// not passing in optional seqused_k_ - logsumexp = logsumex; - std::cout << "returned IN MEM_EFF_FORWARD SOFTMAX_DEVICE: " << logsumex.device() << std::endl; - std::cout << "other one IN MEM_EFF_FORWARD SOFTMAX_DEVICE: " << logsumexp.device() << std::endl; - std::cout << "logsum shape: " << logsumexp.sizes() << std::endl; + logsumexp = lse; } else { // use aotriton auto ret = aotriton::v2::flash::check_gpu(stream); if (hipSuccess != ret) { @@ -1293,7 +1290,6 @@ std::tuple _efficient_ query.options().dtype(at::ScalarType::Float)); } } // CK BACKEND - std::cout << "AFTER MY FWD CODE RAN SOFTMAX_DEVICE" << logsumexp.device() << std::endl; #else // CUDA Implementation cudaDeviceProp* p = at::cuda::getDeviceProperties(query.device().index()); @@ -1466,8 +1462,6 @@ std::tuple _efficient_ AT_CUDA_CHECK(cudaGetLastError()); #endif // USE_ROCM - std::cout << "COMPUTE_LOG_SUM?????: " << compute_logsumexp << std::endl; - std::cout << "RETURNING FROM MEM_EFF_FWD SOFTMAX_DEVICE: " << logsumexp.device() << std::endl; std::cout << "res dtype: " << res.dtype() << std::endl; return std::make_tuple( std::move(res), diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu index fb6d3b32b6d2..ef66dffa276b 100644 --- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu +++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu @@ -415,7 +415,6 @@ _efficient_attention_backward( if(at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { std::cout << "BACKWARD CK ATTENTION" << std::endl; - std::cout << "EFFICIENT_ATTENTION SOFTMAX_DEVICE: " << logsumexp.device() << std::endl; const auto my_softmax_scale = sdp::calculate_scale(query, scale).expect_float(); // Store grad_bias in optional std::optional opt_grad_bias = grad_bias; @@ -892,7 +891,6 @@ std::tuple _scaled_dot_product_e sdp::CustomMaskType custom_mask_type = causal ? 
sdp::CustomMaskType::CausalFromTopLeft : sdp::CustomMaskType::NoCustomMask; - std::cout << "sdpea SOFTMAX_DEVICE" << logsumexp.device() << std::endl; auto [grad_q, grad_k, grad_v, grad_bias] = at::_efficient_attention_backward( grad_out, diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip index 153156ae6ae1..dd8059f2d919 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip @@ -50,7 +50,6 @@ mem_eff_backward_ck( // both of these return dq, dk, dv, softmax_d // need to also return attn_bias // call mha_bwd_ck - std::cout << "MEM_EFF_BWD_CK::SOFTMAX_DEVICE: " << softmax_lse.device() << std::endl; auto [dQ, dK, diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip index 91073a39d5e5..07b3c6c80f49 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip @@ -263,9 +263,6 @@ mha_bwd_ck(const at::Tensor &dout, // batch_size x seqlen_q x std::string q_dtype_str = q_dtype == at::kHalf ? "fp16" : "bf16"; CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); - std::cout << "ABOUT TO CHECK DEVICE OF SOFTMAX_LSE WEEEE" << std::endl; - std::cout << "BWD SOFTMAX_DEVICE: " << softmax_lse.device() << std::endl; - std::cout << softmax_lse << std::endl; CHECK_DEVICE(out); CHECK_DEVICE(dout); CHECK_DEVICE(softmax_lse); TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip index a45709298cab..863172e18aea 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip @@ -357,7 +357,6 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x softmax_scale, p_dropout, drop_seed_offset); - std::cout << "FWD SOFTMAX_DEVICE: " << softmax_lse.device() << std::endl; float t = fmha_fwd(traits, args, stream_config); TORCH_CHECK(t >= 0, "invalid argument for fmha_fwd"); } @@ -372,7 +371,6 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size}); softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1}); } - std::cout << "END OF MHA_FWD_CK SOFTMAX_DEVICE: " << softmax_lse.device() << std::endl; return {out, q_padded, k_padded, v_padded, softmax_lse, seed_t, offset_t, p}; } } //namespace pytorch_flash From 5c9fd0b3789b4c607e697c4402fa0cff7d76b492 Mon Sep 17 00:00:00 2001 From: Andy Lugo Date: Thu, 20 Feb 2025 23:50:30 +0000 Subject: [PATCH 46/46] Chasing varlen bug. 
saving place --- .../ATen/native/transformers/attention.cpp | 3 +- .../transformers/cuda/attention_backward.cu | 5 +++ .../hip/flash_attn/ck/me_bwd_ck.hip | 3 +- .../hip/flash_attn/ck/me_fwd_ck.hip | 6 +-- .../hip/flash_attn/ck/mha_bwd_ck.hip | 2 +- .../hip/flash_attn/ck/mha_fwd_ck.hip | 3 +- .../hip/flash_attn/ck/mha_varlen_bwd_ck.hip | 1 + .../hip/flash_attn/ck/mha_varlen_fwd_ck.hip | 20 ++++++++- test/test_transformers.py | 42 +++++++++++++------ 9 files changed, 64 insertions(+), 21 deletions(-) diff --git a/aten/src/ATen/native/transformers/attention.cpp b/aten/src/ATen/native/transformers/attention.cpp index b96787cbda5f..7beb24b8cb6c 100644 --- a/aten/src/ATen/native/transformers/attention.cpp +++ b/aten/src/ATen/native/transformers/attention.cpp @@ -74,7 +74,7 @@ #include #include #endif - +#include #include namespace at::native { @@ -741,6 +741,7 @@ Tensor scaled_dot_product_attention( if (attn_mask.has_value()) { attn_mask.value() = preprocess_mask(attn_mask.value(), query_, key, value);; } + //std::cout << "OUTERMOST Q SHAPE: " << query_.sizes() << std::endl; auto out_and_lse = at::_scaled_dot_product_efficient_attention( query_, key, value, attn_mask, compute_logsumexp, dropout_p, is_causal, scale); return std::get<0>(out_and_lse); diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu index ef66dffa276b..299ae9b15a19 100644 --- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu +++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu @@ -448,6 +448,7 @@ _efficient_attention_backward( false, // zero_tensors philox_seed, philox_offset); + grad_bias = dBias; } else { // Use aotriton @@ -794,6 +795,10 @@ _efficient_attention_backward( TORCH_CHECK(kernel_launched, "cutlassB: no kernel found to launch!"); AT_CUDA_CHECK(cudaGetLastError()); #endif // USE_ROCM + std::cout << "DEVICE_grad_Q: " << grad_q.device() << std::endl; + std::cout << "DEVICE_grad_K: " << grad_k.device() << std::endl; + std::cout << "DEVICE_grad_V: " << grad_v.device() << std::endl; + std::cout << "DEVICE_grad_B: " << grad_bias.device() << std::endl; return std::make_tuple(std::move(grad_q), std::move(grad_k), std::move(grad_v), std::move(grad_bias)); #endif // defined(USE_MEM_EFF_ATTENTION) TORCH_CHECK(false, "USE_MEM_EFF_ATTENTION was not enabled for build.") diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip index dd8059f2d919..b3ab46704f73 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip @@ -35,7 +35,7 @@ mem_eff_backward_ck( at::Tensor philox_offset) { // TODO implement wrapper - std::cout << "HIT MY MEM_EFF ENTRY POINT" << std::endl; + std::cout << "HIT MY MEM_EFF BWD ENTRY POINT" << std::endl; const int non_null_window_left = -1; const int non_null_window_right = -1; @@ -77,7 +77,6 @@ mem_eff_backward_ck( deterministic, philox_seed, philox_offset); - //TODO_ANDY: make this also return attention bias return std::make_tuple(std::move(dQ), std::move(dK), std::move(dV), std::move(dBias)); } else { diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip index 858a92e7549e..ae2652972cce 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip +++ 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip @@ -43,7 +43,7 @@ mem_eff_forward_ck( // need to pass attn_bias to both of these - if(!cu_seqlens_q.has_value()){ + if(!seqstart_q.has_value()){ return mha_fwd_ck( q, // q k, // k @@ -68,8 +68,8 @@ mem_eff_forward_ck( k, // k v, // v out_, // opt(out) - cu_seqlens_q.value(), // cu_seqlens_q - cu_seqlens_k.value(), // cu_seqlens_k + seqstart_q.value(), // cu_seqlens_q + seqstart_k.value(), // cu_seqlens_k seqused_k_, // opt(seqused_k) max_seqlen_q, // max_seqlen_q max_seqlen_k, // max_seqlen_k diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip index 07b3c6c80f49..a859c3bb1133 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_bwd_ck.hip @@ -448,7 +448,7 @@ mha_bwd_ck(const at::Tensor &dout, // batch_size x seqlen_q x dbias = at::empty({batch_size, num_heads, seqlen_q, seqlen_k}, q.options()); } - + std::cout << "MHA_BWD_CK RAN AND COMPLETED" << std::endl; return { dq, dk, dv, softmax_d, dbias }; } } // namespace pytorch_flash diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip index 863172e18aea..236c1df2f447 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_fwd_ck.hip @@ -225,7 +225,8 @@ mha_fwd_ck(const at::Tensor &q, // batch_size x seqlen_q x seqlen_q = ngroups; num_heads = num_heads_k; } - + std::cout << "MHA_FWD_CK: CHECKING temp_Q SHAPE: " << temp_q.sizes() << std::endl; + std::cout << "MHA_FWD_CK: CHECKING Q SHAPE : " << q.sizes() << std::endl; CHECK_SHAPE(temp_q, batch_size, seqlen_q, num_heads, head_size); CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size); CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size); diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip index 3e31e77a251c..91f11e697cae 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_bwd_ck.hip @@ -471,5 +471,6 @@ mha_varlen_bwd_ck(const at::Tensor &dout, // total_q x num_hea return { dq, dk, dv, softmax_d, dbias }; + std::cout << "TOUCHING VARLEN BWD" << std::endl; } } // namespace pytorch_flash diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_fwd_ck.hip index 20ad315d3025..7d3ed6075620 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha_varlen_fwd_ck.hip @@ -165,6 +165,7 @@ mha_varlen_fwd_ck(const at::Tensor &q, // total_q x num_heads std::optional gen_, const std::optional& attn_bias_) { + std::cout << "MHA_VARLEN_BWD_CK ENTER" << std::endl; auto q_dtype = q.dtype(); TORCH_CHECK(q_dtype == at::kHalf || q_dtype == at::kBFloat16, "FlashAttention only support fp16 and bf16 data type"); @@ -233,7 +234,24 @@ mha_varlen_fwd_ck(const at::Tensor &q, // total_q x num_heads mask = mask_info::decode(mask_identify, max_seqlen_q, max_seqlen_k); // local } - CHECK_SHAPE(q, total_q, num_heads, head_size_og); + + std::cout << "ABOUT TO CHECK Q's shape: " << q.sizes() << 
std::endl; + std::cout << "total_q : " << total_q << std::endl; + std::cout << "num_heads : " << num_heads << std::endl; + std::cout << "head_size_og: " << head_size_og << std::endl; + + std::cout << "ABOUT TO CHECK K's shape: " << k.sizes() << std::endl; + std::cout << "total_k : " << total_k << std::endl; + std::cout << "num_heads_k : " << num_heads_k << std::endl; + std::cout << "head_size_og: " << head_size_og << std::endl; + + std::cout << "ABOUT TO CHECK V's shape: " << v.sizes() << std::endl; + std::cout << "total_v : " << total_k << std::endl; + std::cout << "num_heads_v : " << num_heads_k << std::endl; + std::cout << "head_size_og: " << head_size_og << std::endl; + + + //CHECK_SHAPE(q, total_q, num_heads, head_size_og); CHECK_SHAPE(k, total_k, num_heads_k, head_size_og); CHECK_SHAPE(v, total_k, num_heads_k, head_size_og); CHECK_SHAPE(cu_seqlens_q, batch_size + 1); diff --git a/test/test_transformers.py b/test/test_transformers.py index f3ad8fb1edda..3bc7a6fe7586 100644 --- a/test/test_transformers.py +++ b/test/test_transformers.py @@ -2602,8 +2602,11 @@ def test_mem_efficient_attention_mask_variants(self, device, mask_dim: list[int] torch.backends.cuda.preferred_rocm_fa_library("ck") dtype = torch.float16 make_tensor = partial(torch.rand, device=device, dtype=dtype, requires_grad=True) - batch, num_heads, head_dim = 1, 4, 8 - seq_len_q, seq_len_kv = 16, 32 + batch, num_heads, head_dim = 8, 8, 64 + seq_len_q, seq_len_kv = 64, 15 + + #batch, num_heads, head_dim = 1, 4, 8 + #seq_len_q, seq_len_kv = 16, 32 print("") print("batch : " , batch) print("nheads : " , num_heads) @@ -2627,8 +2630,9 @@ def test_mem_efficient_attention_mask_variants(self, device, mask_dim: list[int] out.sum().backward() @unittest.skipIf(not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Fused SDPA was not built for this system") - @parametrize("dtype", [torch.float, torch.float16]) + @parametrize("dtype", [torch.float16]) def test_mem_eff_attention_non_contiguous_mask(self, device, dtype): + torch.backends.cuda.preferred_rocm_fa_library("ck") make_tensor = partial(torch.rand, device=device, dtype=dtype, requires_grad=True) batch, num_heads, head_dim = 8, 8, 64 seq_len_q, seq_len_kv = 64, 16 @@ -2642,8 +2646,9 @@ def test_mem_eff_attention_non_contiguous_mask(self, device, dtype): out.sum().backward() @unittest.skipIf(not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Fused SDPA was not built for this system") - @parametrize("dtype", [torch.float, torch.float16]) + @parametrize("dtype", [torch.float16]) def test_mem_eff_attention_long_sequence_mask(self, device, dtype): + torch.backends.cuda.preferred_rocm_fa_library("ck") if torch.cuda.get_device_properties('cuda').total_memory < 80 * 2**30: unittest.skip("This test requires substatnial GPU memory.") return @@ -2701,11 +2706,13 @@ def test_singelton_head_dim_stride_ne_1(self, device): scaled_dot_product_attention(query, key, value) @unittest.skipIf(not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Fused SDPA was not built for this system") - @parametrize("type", ["dense", "nested"]) + #@parametrize("type", ["dense", "nested"]) + @parametrize("type", ["nested"]) @parametrize("is_contiguous", [True, False]) def test_scaled_dot_product_attention_fused_kernels_packed(self, device, type: str, is_contiguous: bool): + torch.backends.cuda.preferred_rocm_fa_library("ck") make_tensor = partial(rand_sdpa_tensor, type=type, device=device, dtype=torch.float16, packed=True) - + batch_size, seq_len, num_heads, head_dim = 32, 64, 16, 64 shape = SdpaShape(batch_size, num_heads, seq_len, head_dim) @@ 
-2716,7 +2723,9 @@ def test_scaled_dot_product_attention_fused_kernels_packed(self, device, type: s query = query.view(batch_size, -1, num_heads, head_dim).transpose(1, 2) value = value.view(batch_size, -1, num_heads, head_dim).transpose(1, 2) key = key.view(batch_size, -1, num_heads, head_dim).transpose(1, 2) - + #print("python_q shape: ", query.size(0)) + #print("python_v shape: ", value.size(1)) + #print("python_k shape: ", key.size(5)) if is_contiguous: query = query.contiguous() key = key.contiguous() @@ -2733,10 +2742,11 @@ def test_scaled_dot_product_attention_fused_kernels_packed(self, device, type: s self.assertEqual(actual.contiguous(), math_ref.contiguous(), atol=2e-3, rtol=1e-2) @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_ATTENTION, "Fused SDPA was not built for this system") - @parametrize("type", ["dense", "nested"]) + @parametrize("type", ["dense"]) @parametrize("fused_kernel", [SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION] if PLATFORM_SUPPORTS_FLASH_ATTENTION else [SDPBackend.EFFICIENT_ATTENTION]) def test_scaled_dot_product_attention_fused_kernels_packed_accuracy(self, device, type: str, fused_kernel: str): + torch.backends.cuda.preferred_rocm_fa_library("ck") def rand_nt(shape): batch, seq_len, num_heads, head_dim = shape tensors = [6 * torch.rand((seq_len, 3 * num_heads * head_dim), device=device, dtype=torch.float32) - 3 @@ -2801,12 +2811,14 @@ def rand_tensor(shape): @parametrize("contiguous_inputs", [True, False]) @parametrize("is_causal", [True, False]) def test_sdp_mem_efficient_grad_against_math(self, device, contiguous_inputs: bool, is_causal: bool): + torch.set_printoptions(profile="full") + torch.backends.cuda.preferred_rocm_fa_library("ck") batch_size, seq_len, num_heads, head_dim = 4, 4, 2, 16 make_tensor = partial(rand_sdpa_tensor, type="dense", device=device, - dtype=torch.float64, requires_grad=True, packed=True) + dtype=torch.float16, requires_grad=True, packed=True) qkv = make_tensor(SdpaShape(batch_size, num_heads, seq_len, head_dim)) - qkv_lp = qkv.detach().clone().to(torch.float32).requires_grad_() + qkv_lp = qkv.detach().clone().to(torch.float16).requires_grad_() query, key, value = qkv.chunk(3, dim=-1) query_lp, key_lp, value_lp = qkv_lp.chunk(3, dim=-1) @@ -2836,13 +2848,19 @@ def test_sdp_mem_efficient_grad_against_math(self, device, contiguous_inputs: bo query_lp, key_lp, value_lp, None, 0.0, is_causal) rand_upward = torch.rand_like(out) - rand_upward_lp = rand_upward.to(torch.float32) + rand_upward_lp = rand_upward.to(torch.float16) out.backward(rand_upward) out_lp.backward(rand_upward_lp) # Cast up and compare - self.assertEqual(qkv.grad, qkv_lp.grad.to(torch.float64), atol=1e-5, rtol=1e-5) + #print(out) + print(qkv.grad) + print("=================================================================") + print(qkv_lp.grad) + #print(out_lp) + self.assertEqual(qkv.grad, qkv_lp.grad.to(torch.float16), atol=1e-5, rtol=1e-5) + #self.assertEqual(qkv, qkv_lp.to(torch.float16), atol=1e-5, rtol=1e-5) @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Flash Attention was not built for this system") @parametrize("contiguous_inputs", [True, False])